// Reads a 32-bit big-endian unsigned integer from the current position of
// $file. Bytes missing because of end-of-file are treated as zero.
function readInt($file)
{
  $bytes = fread($file, 4);
  if ($bytes === false) $bytes = "";
  // pad short reads so that unpack() always sees exactly four bytes
  $bytes = str_pad($bytes, 4, "\0");
  $parts = unpack("Nvalue", $bytes);
  return $parts["value"];
}

// Reads bytes from $file up to (and consuming) the next NUL byte, or up to
// end-of-file, and returns them as a string without the terminator.
function readString($file)
{
  $result = "";
  while (($c = fgetc($file)) !== false && $c !== "\0")
  {
    $result .= $c;
  }
  return $result;
}

// Reads the first four bytes at the current position of $file and returns
// them as a string (shorter, possibly empty, when the file is truncated).
function readHeader($file)
{
  $header = fread($file, 4);
  return ($header === false) ? "" : $header;
}

// Computes the hash-table slot for $word.
//
// The slot is derived from the first two bytes of the word only, so every
// word sharing a two-byte prefix lands in the same bucket; this is what
// allows search() to match on prefixes.
//
// Returns -1 when the word is shorter than two bytes or when one of its
// first two bytes is NUL, otherwise a value in the range 257..65535.
function computeIndex($word)
{
  if (strlen($word) < 2) return -1;
  // high char of the index
  $hi = ord($word[0]);
  if ($hi == 0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo == 0) return -1;
  return $hi*256 + $lo;
}

// Looks up $word in the index file and appends one entry per index word
// that starts with $word to $statsList.
//
// $file      - open handle of the search index
// $word      - word to search for (compared byte-wise, so the caller is
//              expected to pass it in the same case as the index)
// $statsList - (in/out) list of match records; each record has the keys
//              "word", "match", "index", "full" and "docs", where "docs" is
//              a list of arrays with keys "idx", "freq", "rank", "hi",
//              "name" and "url"
//
// Returns the updated $statsList.
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index == -1) return $statsList; // word too short to be indexed

  fseek($file, $index*4 + 4); // 4 bytes per entry, skip header
  $bucket = readInt($file);
  if (!$bucket) return $statsList; // no words for this hash key

  // collect all words of the bucket that have $word as a prefix
  $start   = sizeof($statsList);
  $count   = $start;
  $wordLen = strlen($word);
  fseek($file, $bucket);
  $w = readString($file);
  while ($w)
  {
    $statIdx = readInt($file);
    if ($word == substr($w, 0, $wordLen))
    { // found word that matches (as substring)
      $statsList[$count++] = array(
          "word"  => $word,
          "match" => $w,
          "index" => $statIdx,
          "full"  => strlen($w) == $wordLen,
          "docs"  => array()
        );
    }
    $w = readString($file);
  }
  $end = sizeof($statsList);

  // read the document statistics for every match found above
  $totalHi     = 0;
  $totalFreqHi = 0;
  $totalFreqLo = 0;
  for ($count = $start; $count < $end; $count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    fseek($file, $statsList[$count]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i = 0; $i < $numDocs; $i++)
    {
      $idx  = readInt($file);
      $freq = readInt($file);
      // the lowest bit of the stored value flags a high priority document,
      // the remaining bits hold the occurrence count
      $docInfo[$i] = array("idx"  => $idx,
                           "freq" => $freq>>1,
                           "rank" => 0.0,
                           "hi"   => $freq&1
                          );
      // NOTE(review): the totals below add the raw stored value (priority
      // bit included) rather than $freq>>1; kept as is so that the
      // resulting ranking does not change.
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi += $freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo += $freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i = 0; $i < $numDocs; $i++)
    {
      fseek($file, $docInfo[$i]["idx"]);
      $docInfo[$i]["name"] = readString($file);
      $docInfo[$i]["url"]  = readString($file);
    }
    $statsList[$count]["docs"] = $docInfo;
  }

  // compute frequency rank of the word in each doc
  $totalFreq = ($totalHi+1)*$totalFreqLo + $totalFreqHi;
  for ($count = $start; $count < $end; $count++)
  {
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    $numDocs = sizeof($statsList[$count]["docs"]);
    for ($i = 0; $i < $numDocs; $i++)
    {
      $weight = $statsList[$count]["docs"][$i]["freq"]*$multiplier;
      if ($statsList[$count]["docs"][$i]["hi"])
      {
        // high priority docs are lifted above all low priority ones
        $weight += $totalFreqLo;
      }
      // guard against a zero total (all stored frequencies are zero)
      $statsList[$count]["docs"][$i]["rank"] =
        ($totalFreq != 0) ? (float)$weight/$totalFreq : 0.0;
    }
  }
  return $statsList;
}

// Merges the per-word match records produced by search() into a map of
// documents keyed by URL.
//
// $results - list of match records as filled in by search()
// $docs    - (in/out) map url => array("url", "name", "rank", "words");
//            ranks of the same document are summed and every contributing
//            word is appended to its "words" list
//
// Returns the updated $docs.
function combine_results($results,&$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key  = $di["url"];
      $rank = $di["rank"];
      if (array_key_exists($key, $docs))
      {
        $docs[$key]["rank"] += $rank;
      }
      else
      {
        $docs[$key] = array("url"   => $key,
                            "name"  => $di["name"],
                            "rank"  => $rank,
                            "words" => array()
                           );
      }
      $docs[$key]["words"][] = array(
          "word"  => $wordInfo["word"],
          "match" => $wordInfo["match"],
          "freq"  => $di["freq"]
        );
    }
  }
  return $docs;
}

// Returns the subset of $docs (keys preserved) that contains every word of
// $requiredWords and no word of $forbiddenWords. Words are compared against
// the "word" field of each entry in a document's "words" list.
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs = array();
  foreach ($docs as $key => $doc)
  {
    $words = $doc["words"];
    $copy = 1; // copy entry by default

    foreach ($requiredWords as $reqWord)
    {
      $found = 0;
      foreach ($words as $wordInfo)
      {
        if ($wordInfo["word"] === $reqWord) { $found = 1; break; }
      }
      if (!$found)
      {
        $copy = 0; // document lacks at least one required word
        break;
      }
    }

    if ($copy)
    {
      foreach ($words as $wordInfo)
      {
        foreach ($forbiddenWords as $badWord)
        {
          if ($wordInfo["word"] === $badWord)
          {
            $copy = 0; // document contains a forbidden word
            break 2;
          }
        }
      }
    }

    if ($copy) $filteredDocs[$key] = $doc;
  }
  return $filteredDocs;
}

// usort() callback ordering documents by descending "rank".
function compare_rank($a,$b)
{
  if ($a["rank"] == $b["rank"])
  {
    return 0;
  }
  return ($a["rank"] > $b["rank"]) ? -1 : 1;
}

// Stores a copy of $docs, sorted by descending rank and re-indexed from 0,
// in $sorted and also returns it.
function sort_results($docs,&$sorted)
{
  $sorted = $docs;
  usort($sorted, "compare_rank");
  return $sorted;
}

// Writes the result table for $docs (as produced by sort_results()) to the
// output. The texts come from search_results(), matches_text() and
// report_matches(), which are defined outside of this file.
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo "  <tr>\n";
  echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo "  </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs == 0)
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo "  </tr>\n";
  }
  else
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo "    </td>\n";
    echo "  </tr>\n";
    $num = 1;
    foreach ($docs as $doc)
    {
      // NOTE(review): url and name are emitted verbatim; presumably the
      // index generator stores them ready for HTML output - confirm before
      // adding escaping here, as that could double-encode entities.
      echo "  <tr>\n";
      echo "    <td align=\"right\">$num.</td>";
      echo "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      echo "  </tr>\n"; // close the title row before opening the match row
      echo "  <tr>\n";
      echo "    <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        // the search word originates from the request, so escape it
        $word       = htmlspecialchars($wordInfo["word"], ENT_QUOTES);
        $matchRight = htmlspecialchars(
                        (string)substr($wordInfo["match"], strlen($wordInfo["word"])),
                        ENT_QUOTES);
        echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
      }
      echo "    </td>\n";
      echo "  </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}

// Entry point: opens "search.idx", runs the query given in the "query"
// request parameter and prints the ranked results.
//
// Query syntax: words separated by spaces; a leading '+' marks a word as
// required, a leading '-' marks it as forbidden.
function main()
{
  // version_compare() orders "10.0.0" after "4.1.0", unlike strcmp();
  // the function itself only exists from PHP 4.1.0 on
  if (!function_exists("version_compare") ||
      version_compare(phpversion(), "4.1.0", "<"))
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file = fopen("search.idx", "rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file) != "DOXS")
  {
    fclose($file);
    die("Error: Header of index file is invalid!");
  }
  $query = "";
  // ignore array-valued parameters such as query[]=x
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query = $_GET["query"];
  }
  // only letters, digits, ':', '.', tab and space are passed on to the form
  end_form(preg_replace('/[^[:alnum:]:.\t ]/', " ", $query));
  echo "&nbsp;\n<div class=\"searchresults\">\n";
  $results        = array();
  $requiredWords  = array();
  $forbiddenWords = array();
  $foundWords     = array();
  $word = strtok($query, " ");
  while ($word !== false) // for each word in the search query
  {
    $prefix = $word[0];
    if ($prefix == '+' || $prefix == '-')
    {
      $word = (string)substr($word, 1);
    }
    // the index is searched in lower case, so the required and forbidden
    // lists have to be kept in lower case as well
    $word = strtolower($word);
    if ($word !== "") // skip a lone '+' or '-'
    {
      if ($prefix == '+') $requiredWords[]  = $word;
      if ($prefix == '-') $forbiddenWords[] = $word;
      if (!isset($foundWords[$word]))
      {
        $foundWords[$word] = true;
        search($file, $word, $results);
      }
    }
    $word = strtok(" ");
  }
  $docs = array();
  combine_results($results, $docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs, $requiredWords, $forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs, $sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n";
  fclose($file);
}

main();