search_php.h

Go to the documentation of this file.
/**
 * Reads a 32-bit big-endian integer from the current file position.
 *
 * Consumes exactly four fgetc() calls; the first byte read is the most
 * significant one.
 *
 * @param resource $file open file handle positioned at the integer
 * @return int the decoded value
 */
function readInt($file)
{
  $value = 0;
  for ($n=0;$n<4;$n++)
  {
    // shift what we have so far and append the next byte at the low end
    $value = ($value<<8) | ord(fgetc($file));
  }
  return $value;
}
00007 "\n"
/**
 * Reads a NUL-terminated string from the current file position.
 *
 * Characters are consumed one at a time up to and including the first
 * byte whose ordinal value is 0; that terminator is not part of the result.
 *
 * @param resource $file open file handle positioned at the string
 * @return string the bytes read before the terminator (may be empty)
 */
function readString($file)
{
  $text = "";
  for (;;)
  {
    $ch = fgetc($file);
    if (!ord($ch)) break; // terminator reached
    $text .= $ch;
  }
  return $text;
}
00014 "\n"
/**
 * Reads the next four bytes from the file and returns them as one string.
 *
 * main() compares the result against the "DOXS" magic marker.
 *
 * @param resource $file open file handle
 * @return string the four characters read
 */
function readHeader($file)
{
  $magic = fgetc($file);
  for ($n=1;$n<4;$n++)
  {
    $magic .= fgetc($file);
  }
  return $magic;
}
00021 "\n"
/**
 * Computes the hash-table slot for a search word.
 *
 * The slot is built from the first two bytes of the word only, so every
 * word sharing a two-character prefix lands in the same slot; this is what
 * allows search() to do prefix (substring) matching.
 *
 * @param string $word the word to hash
 * @return int first byte * 256 + second byte, or -1 when the word is
 *             shorter than two bytes or one of its first two bytes is NUL
 */
function computeIndex($word)
{
  if (strlen($word)<2) return -1;
  // high char of the index
  // ([] offsets: the {} form is a parse error as of PHP 8)
  $hi = ord($word[0]);
  if ($hi==0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo==0) return -1;
  // return index
  return $hi*256+$lo;
}
00045 "\n"
/**
 * Looks up $word in the index file and appends one entry per indexed word
 * that starts with $word to $statsList.
 *
 * As read by this function, the file is used as follows: the slot table
 * starts at offset 4 with 4 bytes per slot; a non-zero slot holds the
 * offset of a list of (NUL-terminated word, 4-byte stats offset) pairs
 * ended by an empty word; a stats record is a document count followed by
 * (document offset, frequency) pairs, where bit 0 of the frequency flags a
 * high priority document; a document record is two NUL-terminated strings,
 * name then url.
 *
 * Each appended entry is an array with the keys "word", "match", "index",
 * "full" and "docs"; every element of "docs" has "idx", "freq", "rank",
 * "hi", "name" and "url".
 *
 * @param resource $file       open index file
 * @param string   $word       word to look for (main() passes it lower-cased)
 * @param array    $statsList  result list, extended in place
 * @return array the updated $statsList
 */
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // word cannot be hashed

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $index = readInt($file);
  if (!$index) return $statsList; // no words stored for this hash key

  $start = sizeof($statsList);
  $count = $start;
  $wordLen = strlen($word);
  fseek($file,$index);
  // An empty string ends the word list. Compare against "" explicitly so
  // that a word such as "0" is not mistaken for the terminator.
  $w = readString($file);
  while ($w!=="")
  {
    $statIdx = readInt($file);
    // Byte-wise prefix test. A loose == would compare numeric-looking
    // strings as numbers ("100" == "1e2") and report false matches.
    if (strncmp($w,$word,$wordLen)==0)
    { // found word that matches (as substring)
      $statsList[$count++]=array(
          "word"=>$word,
          "match"=>$w,
          "index"=>$statIdx,
          "full"=>strlen($w)==$wordLen,
          "docs"=>array()
          );
    }
    $w = readString($file);
  }

  // Pass 1: load the document lists of the new entries and accumulate
  // the frequency totals needed for ranking.
  $totalHi=0;
  $totalFreqHi=0;
  $totalFreqLo=0;
  $end = sizeof($statsList);
  for ($count=$start;$count<$end;$count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    fseek($file,$statsList[$count]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx=readInt($file);
      $freq=readInt($file);
      $docInfo[$i]=array("idx"  => $idx,
                         "freq" => $freq>>1,
                         "rank" => 0.0,
                         "hi"   => $freq&1
                        );
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi+=$freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo+=$freq*$multiplier;
      }
    }
    // read name and url info for the doc; done after the whole list has
    // been read because each lookup seeks away from the stats record
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"]=readString($file);
      $docInfo[$i]["url"]=readString($file);
    }
    $statsList[$count]["docs"]=$docInfo;
  }

  // Pass 2: turn the frequencies into ranks relative to the totals.
  $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
  if ($totalFreq==0) return $statsList; // nothing to scale by; ranks stay 0.0
  for ($count=$start;$count<$end;$count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    $numDocs = sizeof($statsList[$count]["docs"]);
    for ($i=0;$i<$numDocs;$i++)
    {
      // compute frequency rank of the word in each doc
      $weight = $statsList[$count]["docs"][$i]["freq"]*$multiplier;
      if ($statsList[$count]["docs"][$i]["hi"])
      {
        // high priority docs are lifted above every low priority one
        $weight += $totalFreqLo;
      }
      $statsList[$count]["docs"][$i]["rank"] = (float)$weight/$totalFreq;
    }
  }
  return $statsList;
}
00143 "\n"
00144 "function combine_results($results,&$docs)\n"
00145 "{\n"
00146 "  foreach ($results as $wordInfo)\n"
00147 "  {\n"
00148 "    $docsList = &$wordInfo[\"docs\"];\n"
00149 "    foreach ($docsList as $di)\n"
00150 "    {\n"
00151 "      $key=$di[\"url\"];\n"
00152 "      $rank=$di[\"rank\"];\n"
00153 "      if (in_array($key, array_keys($docs)))\n"
00154 "      {\n"
00155 "        $docs[$key][\"rank\"]+=$rank;\n"
00156 "      }\n"
00157 "      else\n"
00158 "      {\n"
00159 "        $docs[$key] = array(\"url\"=>$key,\n"
00160 "            \"name\"=>$di[\"name\"],\n"
00161 "            \"rank\"=>$rank\n"
00162 "            );\n"
00163 "      }\n"
00164 "      $docs[$key][\"words\"][] = array(\n"
00165 "               \"word\"=>$wordInfo[\"word\"],\n"
00166 "               \"match\"=>$wordInfo[\"match\"],\n"
00167 "               \"freq\"=>$di[\"freq\"]\n"
00168 "               );\n"
00169 "    }\n"
00170 "  }\n"
00171 "  return $docs;\n"
00172 "}\n"
00173 "\n"
00174 "function filter_results($docs,&$requiredWords,&$forbiddenWords)\n"
00175 "{\n"
00176 "  $filteredDocs=array();\n"
00177 "  while (list ($key, $val) = each ($docs)) \n"
00178 "  {\n"
00179 "    $words = &$docs[$key][\"words\"];\n"
00180 "    $copy=1; // copy entry by default\n"
00181 "    if (sizeof($requiredWords)>0)\n"
00182 "    {\n"
00183 "      foreach ($requiredWords as $reqWord)\n"
00184 "      {\n"
00185 "        $found=0;\n"
00186 "        foreach ($words as $wordInfo)\n"
00187 "        { \n"
00188 "          $found = $wordInfo[\"word\"]==$reqWord;\n"
00189 "          if ($found) break;\n"
00190 "        }\n"
00191 "        if (!$found) \n"
00192 "        {\n"
00193 "          $copy=0; // document contains none of the required words\n"
00194 "          break;\n"
00195 "        }\n"
00196 "      }\n"
00197 "    }\n"
00198 "    if (sizeof($forbiddenWords)>0)\n"
00199 "    {\n"
00200 "      foreach ($words as $wordInfo)\n"
00201 "      {\n"
00202 "        if (in_array($wordInfo[\"word\"],$forbiddenWords))\n"
00203 "        {\n"
00204 "          $copy=0; // document contains a forbidden word\n"
00205 "          break;\n"
00206 "        }\n"
00207 "      }\n"
00208 "    }\n"
00209 "    if ($copy) $filteredDocs[$key]=$docs[$key];\n"
00210 "  }\n"
00211 "  return $filteredDocs;\n"
00212 "}\n"
00213 "\n"
/**
 * usort() callback ordering documents by descending "rank".
 *
 * @param array $a first document entry
 * @param array $b second document entry
 * @return int -1 when $a ranks higher, 1 when it ranks lower, 0 on a tie
 */
function compare_rank($a,$b)
{
  $left  = $a["rank"];
  $right = $b["rank"];
  if ($left > $right) return -1;
  if ($left == $right) return 0;
  return 1;
}
00222 "\n"
/**
 * Produces a copy of $docs ordered by compare_rank().
 *
 * usort() reindexes the array, so the url keys of $docs are replaced by
 * consecutive integers in the result.
 *
 * @param array $docs   documents to sort (left untouched)
 * @param array $sorted receives the sorted list
 * @return array the same sorted list
 */
function sort_results($docs,&$sorted)
{
  $ranked = $docs;
  usort($ranked,"compare_rank");
  $sorted = $ranked;
  return $sorted;
}
00229 "\n"
/**
 * Prints the result list as an HTML table.
 *
 * Emits a heading row, a row with the match count and, per document, one
 * row with its number and link followed by one row listing the matched
 * words with their frequencies.
 *
 * The labels come from search_results(), matches_text() and
 * report_matches(), which are not defined in this script and are expected
 * to be supplied by the page that includes it.
 *
 * @param array $docs documents in display order, each with the keys
 *                    "url", "name" and "words"
 * @return void
 */
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo "  <tr>\n";
  echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo "  </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo "  </tr>\n";
  }
  else
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo "    </td>\n";
    echo "  </tr>\n";
    $num=1;
    foreach ($docs as $doc)
    {
      // NOTE(review): url and name come from the index file and are printed
      // as stored; presumably the index generator already made them
      // HTML-safe -- confirm before adding escaping here.
      echo "  <tr>\n";
      echo "    <td align=\"right\">$num.</td>";
      echo     "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      echo "  </tr>\n"; // close the link row before opening the matches row
      echo "  <tr>\n";
      echo "    <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the indexed word that follows the searched prefix
        $matchRight = substr($wordInfo["match"],strlen($word));
        // the word originates from the query string, so escape it
        echo "<b>".htmlspecialchars($word)."</b>".
             htmlspecialchars($matchRight)."(".$wordInfo["freq"].") ";
      }
      echo "    </td>\n";
      echo "  </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}
00271 "\n"
/**
 * Entry point: runs the query from $_GET["query"] against "search.idx"
 * and prints the ranked result list.
 *
 * Query syntax: words separated by spaces; a leading '+' marks a word as
 * required, a leading '-' marks it as forbidden. Matching is done on the
 * lower-cased words.
 *
 * Terminates the script with an error message when the PHP version is too
 * old, the index file cannot be opened, or its header is not "DOXS".
 * end_form() is not defined in this script and is expected to be supplied
 * by the page that includes it.
 *
 * @return void
 */
function main()
{
  // version_compare() orders "10.0.0" after "4.1.0"; a plain strcmp() of
  // the version strings does not. The function itself appeared in 4.1.0.
  if (!function_exists("version_compare") ||
      version_compare(phpversion(),'4.1.0') < 0)
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen("search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!=="DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query="";
  // ignore array-valued parameters such as ?query[]=x
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query=$_GET["query"];
  }
  // preg_replace instead of ereg_replace: the ereg family was removed in
  // PHP 7. Everything except alphanumerics, ':', '.', tab and space is
  // blanked out before the query is shown again in the form.
  end_form(preg_replace('/[^[:alnum:]:\\.\\t ]/', " ", $query ));
  echo "&nbsp;\n<div class=\"searchresults\">\n";
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  $word=strtok($query," ");
  while ($word!==false) // for each word in the search query
  {
    // Lower-case first: search() stores the lower-cased word in its
    // results and filter_results() compares against exactly that text, so
    // "+Foo" would otherwise reject every document.
    $word=strtolower($word);
    $required  = ($word[0]=='+');
    $forbidden = ($word[0]=='-');
    if ($required || $forbidden) $word=(string)substr($word,1);
    if ($word!=="") // a lone "+" or "-" carries no word
    {
      if ($required)  $requiredWords[]=$word;
      if ($forbidden) $forbiddenWords[]=$word;
      if (!in_array($word,$foundWords,true))
      {
        $foundWords[]=$word;
        search($file,$word,$results);
      }
    }
    $word=strtok(" ");
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n";
  fclose($file);
}

main();
00324 "\n"



Generated on Mon Mar 31 10:58:43 2008 by  doxygen 1.5.1