// Reads four bytes from $file and returns them as one big-endian integer.
// Bytes that are missing because the end of the file was reached count as 0.
function readInt($file)
{
  $bytes = fread($file, 4);
  if ($bytes === false) $bytes = "";
  // pad a short read so unpack() always sees exactly four bytes
  $bytes = str_pad($bytes, 4, "\0");
  $parts = unpack("Nvalue", $bytes);
  return $parts["value"];
}

// Reads characters from $file up to (and consuming) the next zero byte or
// the end of the file, and returns them without the terminator.
function readString($file)
{
  $result = "";
  while (($c = fgetc($file)) !== false && $c !== "\0")
  {
    $result .= $c;
  }
  return $result;
}

// Returns the next four bytes of $file as a string (fewer at end of file).
function readHeader($file)
{
  $header = fread($file, 4);
  return ($header === false) ? "" : $header;
}

// Maps the first two bytes of $word to a slot number (hi*256+lo).
// Hashing on the prefix only is what allows substring (prefix) search.
// Returns -1 when $word is shorter than two bytes or starts with a zero byte.
function computeIndex($word)
{
  if (strlen($word) < 2) return -1;
  // high char of the index
  $hi = ord($word[0]);
  if ($hi == 0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo == 0) return -1;
  return $hi*256 + $lo;
}

// Looks up $word in the index file $file and appends one entry per matching
// index word to $statsList. An index word matches when it starts with $word.
// Each entry holds "word", "match", "index", "full" (exact match) and "docs",
// where every doc has "idx", "freq", "rank", "hi", "name" and "url".
// Returns $statsList (also updated in place through the reference).
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index == -1) return $statsList; // no valid index for this word

  fseek($file, $index*4 + 4); // 4 bytes per entry, skip header
  $listOffset = readInt($file);
  if (!$listOffset) return $statsList; // no words for this hash key

  $start = sizeof($statsList);
  $count = $start;
  $wordLen = strlen($word);

  // Walk the list of words sharing the hash key; an empty string ends it.
  // Compare against "" explicitly: the word "0" is falsy but valid.
  fseek($file, $listOffset);
  $w = readString($file);
  while ($w !== "")
  {
    $statIdx = readInt($file);
    // strncmp avoids the numeric-string pitfalls of a loose == comparison
    if (strncmp($w, $word, $wordLen) == 0)
    { // found word that matches (as substring)
      $statsList[$count++] = array(
        "word"  => $word,
        "match" => $w,
        "index" => $statIdx,
        "full"  => strlen($w) == $wordLen,
        "docs"  => array()
      );
    }
    $w = readString($file);
  }

  $end = sizeof($statsList);
  $totalHi = 0;
  $totalFreqHi = 0;
  $totalFreqLo = 0;
  for ($count = $start; $count < $end; $count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    fseek($file, $statsList[$count]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word;
    // bit 0 of the stored value flags a high priority doc
    for ($i = 0; $i < $numDocs; $i++)
    {
      $idx  = readInt($file);
      $freq = readInt($file);
      $docInfo[$i] = array("idx"  => $idx,
                           "freq" => $freq>>1,
                           "rank" => 0.0,
                           "hi"   => $freq&1
                          );
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi += $freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo += $freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i = 0; $i < $numDocs; $i++)
    {
      fseek($file, $docInfo[$i]["idx"]);
      $docInfo[$i]["name"] = readString($file);
      $docInfo[$i]["url"]  = readString($file);
    }
    $statsList[$count]["docs"] = $docInfo;
  }

  $totalFreq = ($totalHi+1)*$totalFreqLo + $totalFreqHi;
  // nothing to normalise against: leave every rank at its initial 0.0
  // instead of dividing by zero
  if ($totalFreq == 0) return $statsList;

  for ($count = $start; $count < $end; $count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    $numDocs = sizeof($statsList[$count]["docs"]);
    for ($i = 0; $i < $numDocs; $i++)
    {
      // compute frequency rank of the word in each doc
      $weight = $statsList[$count]["docs"][$i]["freq"]*$multiplier;
      if ($statsList[$count]["docs"][$i]["hi"])
      {
        $weight += $totalFreqLo;
      }
      $statsList[$count]["docs"][$i]["rank"] = (float)$weight/$totalFreq;
    }
  }
  return $statsList;
}

// Merges the per-word results produced by search() into $docs, keyed by
// document url. The rank of a document is the sum of its per-word ranks and
// "words" lists every word/match/freq that hit the document.
// Returns $docs (also updated in place through the reference).
function combine_results($results,&$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key  = $di["url"];
      $rank = $di["rank"];
      if (array_key_exists($key, $docs))
      {
        $docs[$key]["rank"] += $rank;
      }
      else
      {
        $docs[$key] = array("url"   => $key,
                            "name"  => $di["name"],
                            "rank"  => $rank,
                            "words" => array()
                           );
      }
      $docs[$key]["words"][] = array(
        "word"  => $wordInfo["word"],
        "match" => $wordInfo["match"],
        "freq"  => $di["freq"]
      );
    }
  }
  return $docs;
}

// Returns the entries of $docs that contain every word of $requiredWords and
// no word of $forbiddenWords. Keys are preserved.
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs = array();
  foreach ($docs as $key => $doc)
  {
    $words = $doc["words"];
    $copy = 1; // copy entry by default
    foreach ($requiredWords as $reqWord)
    {
      $found = 0;
      foreach ($words as $wordInfo)
      {
        if ($wordInfo["word"] === $reqWord)
        {
          $found = 1;
          break;
        }
      }
      if (!$found)
      {
        $copy = 0; // document lacks one of the required words
        break;
      }
    }
    if ($copy && sizeof($forbiddenWords) > 0)
    {
      foreach ($words as $wordInfo)
      {
        if (in_array($wordInfo["word"], $forbiddenWords, true))
        {
          $copy = 0; // document contains a forbidden word
          break;
        }
      }
    }
    if ($copy) $filteredDocs[$key] = $doc;
  }
  return $filteredDocs;
}

// usort() callback: orders documents by descending "rank".
function compare_rank($a,$b)
{
  if ($a["rank"] == $b["rank"])
  {
    return 0;
  }
  return ($a["rank"] > $b["rank"]) ? -1 : 1;
}

// Stores a copy of $docs, sorted by descending rank and re-indexed from 0,
// in $sorted and returns it.
function sort_results($docs,&$sorted)
{
  $sorted = $docs;
  usort($sorted, "compare_rank");
  return $sorted;
}

// Writes the result table for $docs as HTML to the output.
// The captions come from search_results(), matches_text() and
// report_matches(), which are defined outside this section.
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo " <tr>\n";
  echo " <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo " </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs == 0)
  {
    echo " <tr>\n";
    echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo " </tr>\n";
  }
  else
  {
    echo " <tr>\n";
    echo " <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo " </td>\n";
    echo " </tr>\n";
    $num = 1;
    foreach ($docs as $doc)
    {
      // NOTE(review): url/name/match come from the index file and are
      // emitted as stored; confirm whether the index holds them pre-escaped.
      echo " <tr>\n";
      echo " <td align=\"right\">$num.</td>";
      echo "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      echo " </tr>\n"; // close the title row before opening the matches row
      echo " <tr>\n";
      echo " <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        $matchRight = substr($wordInfo["match"], strlen($word));
        // the word originates from the query string, so escape it
        echo "<b>".htmlspecialchars($word)."</b>$matchRight(".$wordInfo["freq"].") ";
      }
      echo " </td>\n";
      echo " </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}

// Entry point: opens search.idx, parses the "query" request parameter
// ("+word" marks a required word, "-word" a forbidden one), searches every
// distinct word, then combines, filters, sorts and reports the results.
function main()
{
  // version_compare orders "10.0" after "4.1.0"; strcmp would not
  if (version_compare(phpversion(), '4.1.0', '<'))
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file = fopen("search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file) != "DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query = "";
  // ignore non-string values such as query[]=...
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query = $_GET["query"];
  }
  end_form(preg_replace('/[^[:alnum:]:.\t ]/', " ", $query));
  echo " \n<div class=\"searchresults\">\n";
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array(); // set of words already searched, keyed by word
  $word = strtok($query, " ");
  while ($word !== false) // for each word in the search query
  {
    // the index is searched in lower case, so the required/forbidden
    // lists must hold the lower-cased form as well
    $word = strtolower($word);
    $prefix = $word[0];
    if ($prefix == '+' || $prefix == '-')
    {
      $word = (string)substr($word, 1); // substr may return false on old PHP
      if ($word !== "")
      {
        if ($prefix == '+') $requiredWords[] = $word;
        else                $forbiddenWords[] = $word;
      }
    }
    if ($word !== "" && !isset($foundWords[$word]))
    {
      $foundWords[$word] = true;
      search($file, $word, $results);
    }
    $word = strtok(" ");
  }
  $docs = array();
  combine_results($results, $docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs, $requiredWords, $forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs, $sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n";
  fclose($file);
}

main();