search.php

Go to the documentation of this file.
// Reads four bytes from the current position of $file and combines them
// into one integer, most significant byte first (big-endian).
// A byte that cannot be read (end of file) contributes 0.
function readInt($file)
{
  $value = 0;
  for ($n = 0; $n < 4; $n++)
  {
    // make room for the next byte and append it at the low end
    $value = ($value << 8) | ord(fgetc($file));
  }
  return $value;
}
00007 
// Reads a zero-terminated string from the current position of $file.
// The terminating zero byte is consumed but not included in the result;
// reading also stops at end of file.
function readString($file)
{
  $text = "";
  for (;;)
  {
    $ch = fgetc($file);
    if (ord($ch) == 0) break; // terminator (or nothing left to read)
    $text .= $ch;
  }
  return $text;
}
00014 
// Reads the next four characters from $file and returns them as one
// string; main() compares the result with the "DOXS" signature.
function readHeader($file)
{
  $magic = fgetc($file);
  for ($n = 1; $n < 4; $n++)
  {
    $magic .= fgetc($file);
  }
  return $magic;
}
00021 
// Computes the hash-table slot for $word from its first two characters
// (first char * 256 + second char). Hashing on the prefix is what lets
// search() find words that merely start with $word.
// Returns -1 when $word has fewer than two characters or when either of
// the first two characters is a zero byte.
function computeIndex($word)
{
  if (strlen($word)<2) return -1;
  // square-bracket offsets: the {} offset syntax is a parse error in PHP 8
  $hi = ord($word[0]); // high char of the index
  $lo = ord($word[1]); // low char of the index
  if ($hi==0 || $lo==0) return -1;
  return $hi*256+$lo;
}
00045 
// Looks up $word in the open index $file and appends one entry to
// $statsList for every indexed word that starts with $word.
//
// Each appended entry has the keys:
//   "word"  - the search word as passed in
//   "match" - the indexed word that matched
//   "index" - file offset of the statistics for the matched word
//   "full"  - true when the match is the whole word, not just a prefix
//   "docs"  - list of arrays with "idx", "freq", "rank", "hi", "name", "url"
//
// Ranks are computed relative to all entries added by this call; whole
// word matches count double. Returns $statsList (also updated in place).
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // no valid index for this word

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $listOffset = readInt($file);
  if (!$listOffset) return $statsList; // no words stored for this hash key

  // collect all words in this bucket that start with $word
  $start=sizeof($statsList);
  $count=$start;
  $wordLen=strlen($word);
  fseek($file,$listOffset);
  // the word list ends with an empty string; compare against "" explicitly
  // so that a word such as "0" cannot be mistaken for the end marker
  for ($w=readString($file); $w!==""; $w=readString($file))
  {
    $statIdx = readInt($file);
    // byte-wise prefix test; a loose == would treat numeric-looking
    // strings such as "1e1" and "10." as equal
    if (strncmp($w,$word,$wordLen)===0)
    { // found word that matches (as substring)
      $statsList[$count++]=array(
          "word"=>$word,
          "match"=>$w,
          "index"=>$statIdx,
          "full"=>strlen($w)==$wordLen,
          "docs"=>array()
          );
    }
  }
  $end=sizeof($statsList);

  // read the per-document statistics of every new entry
  $totalHi=0;
  $totalFreqHi=0;
  $totalFreqLo=0;
  for ($count=$start;$count<$end;$count++)
  {
    $statInfo = &$statsList[$count];
    // whole word matches have a double weight
    $multiplier = $statInfo["full"] ? 2 : 1;
    fseek($file,$statInfo["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx=readInt($file);
      $freq=readInt($file);
      // lowest bit of the stored value is the priority flag,
      // the remaining bits are the frequency
      $docInfo[$i]=array("idx"  => $idx,
                         "freq" => $freq>>1,
                         "rank" => 0.0,
                         "hi"   => $freq&1
                        );
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi+=$freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo+=$freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"]=readString($file);
      $docInfo[$i]["url"]=readString($file);
    }
    $statInfo["docs"]=$docInfo;
  }
  unset($statInfo); // drop the reference before it is rebound below

  $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
  // nothing to normalise against: leave every rank at 0.0 instead of
  // dividing by zero
  if ($totalFreq==0) return $statsList;

  // compute frequency rank of the word in each doc
  for ($count=$start;$count<$end;$count++)
  {
    $statInfo = &$statsList[$count];
    // whole word matches have a double weight
    $multiplier = $statInfo["full"] ? 2 : 1;
    $numDocs = sizeof($statInfo["docs"]);
    for ($i=0;$i<$numDocs;$i++)
    {
      $weight = $statInfo["docs"][$i]["freq"]*$multiplier;
      if ($statInfo["docs"][$i]["hi"])
      {
        // high priority docs are lifted above all low priority ones
        $weight += $totalFreqLo;
      }
      $statInfo["docs"][$i]["rank"]=(float)$weight/$totalFreq;
    }
  }
  unset($statInfo);
  return $statsList;
}
00143 
// Merges the per-word search results into one entry per document.
//
// $results is the list built by search(); $docs is keyed by document url
// and is updated in place. Each $docs entry holds "url", "name", the sum
// of the ranks of all words found in that document ("rank") and a list
// "words" with one array("word","match","freq") per hit.
// Returns $docs.
function combine_results($results,&$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key=$di["url"];
      $rank=$di["rank"];
      // direct key lookup: scanning array_keys() with in_array() costs
      // O(n) per hit and compares loosely
      if (array_key_exists($key,$docs))
      {
        $docs[$key]["rank"]+=$rank;
      }
      else
      {
        $docs[$key] = array("url"=>$key,
            "name"=>$di["name"],
            "rank"=>$rank
            );
      }
      $docs[$key]["words"][] = array(
               "word"=>$wordInfo["word"],
               "match"=>$wordInfo["match"],
               "freq"=>$di["freq"]
               );
    }
  }
  return $docs;
}
00173 
// Returns the entries of $docs that contain every word listed in
// $requiredWords and none of the words listed in $forbiddenWords.
// Words are compared with the "word" field of each document's "words"
// list. Keys of $docs are preserved; $docs itself is not modified.
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs=array();
  // foreach instead of each(): each() no longer exists in PHP 8
  foreach ($docs as $key => $doc)
  {
    // the search words that were found in this document
    $present=array();
    foreach ($doc["words"] as $wordInfo)
    {
      $present[]=$wordInfo["word"];
    }

    $copy=true; // copy entry by default
    foreach ($requiredWords as $reqWord)
    {
      if (!in_array($reqWord,$present,true))
      {
        $copy=false; // document lacks at least one required word
        break;
      }
    }
    if ($copy)
    {
      foreach ($forbiddenWords as $badWord)
      {
        if (in_array($badWord,$present,true))
        {
          $copy=false; // document contains a forbidden word
          break;
        }
      }
    }
    if ($copy) $filteredDocs[$key]=$doc;
  }
  return $filteredDocs;
}
00213 
// Comparison callback for sorting documents by descending "rank":
// returns 0 for equal ranks, -1 when $a ranks higher than $b, 1 otherwise.
function compare_rank($a,$b)
{
  $left  = $a["rank"];
  $right = $b["rank"];
  if ($left == $right) return 0;
  if ($left > $right) return -1;
  return 1;
}
00222 
// Orders $docs with compare_rank() (highest rank first).
// The ordered list is stored in $sorted and also returned; usort()
// renumbers the entries, so the original keys of $docs are not kept.
function sort_results($docs,&$sorted)
{
  $ordered = $docs;
  usort($ordered,"compare_rank");
  $sorted = $ordered;
  return $sorted;
}
00229 
// Prints the search results as an HTML table.
//
// $docs is the ranked list of documents; each entry provides "url",
// "name" and a "words" list of array("word","match","freq").
// The heading and the fixed texts come from search_results(),
// matches_text() and report_matches(), which are defined outside this
// file.
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo "  <tr>\n";
  echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo "  </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo "  </tr>\n";
  }
  else
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo "    </td>\n";
    echo "  </tr>\n";
    $num=1;
    foreach ($docs as $doc)
    {
      // first row: position and link to the document
      // NOTE(review): url and name are printed as stored in the index;
      // confirm they are already HTML-safe there before escaping them.
      echo "  <tr>\n";
      echo "    <td align=\"right\">$num.</td>";
      echo     "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      echo "  </tr>\n"; // close the first row before opening the second
      // second row: the words that matched in this document
      echo "  <tr>\n";
      echo "    <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the indexed word that follows the search word
        $matchRight = substr($wordInfo["match"],strlen($word));
        // the search word originates from the request, so escape it
        echo "<b>".htmlspecialchars($word,ENT_QUOTES|ENT_SUBSTITUTE)."</b>"
            .htmlspecialchars((string)$matchRight,ENT_QUOTES|ENT_SUBSTITUTE)
            ."(".$wordInfo["freq"].") ";
      }
      echo "    </td>\n";
      echo "  </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}
00271 
// Entry point of the search page: opens the index file "search.idx",
// runs the query given in the "query" request parameter and prints the
// ranked results.
//
// Query syntax: words are separated by spaces; a leading '+' marks a word
// that must be present in a document, a leading '-' a word that must not
// be present. Words are matched in lower case.
function main()
{
  // version_compare() compares numerically; a plain strcmp() would
  // consider "10.0.0" older than "4.1.0"
  if (version_compare(phpversion(),'4.1.0','<'))
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen("search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!="DOXS")
  {
    fclose($file);
    die("Error: Header of index file is invalid!");
  }
  $query="";
  // ignore array-valued parameters such as ?query[]=x
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query=$_GET["query"];
  }
  // preg_replace() replaces ereg_replace(), which was removed in PHP 7;
  // everything except letters, digits, ':', '.', tab and space is blanked
  end_form(preg_replace("/[^[:alnum:]:\\.\\t ]/", " ", $query ));
  echo "&nbsp;\n<div class=\"searchresults\">\n";
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  $word=strtok($query," ");
  // strtok() returns false when no tokens are left; testing for false
  // explicitly keeps a token such as "0" from ending the loop early
  while ($word!==false) // for each word in the search query
  {
    // lower-case first so that the required/forbidden lists hold the
    // same spelling that search() records for its matches
    $word=strtolower($word);
    $required = ($word[0]=='+');
    $forbidden = ($word[0]=='-');
    if ($required || $forbidden) $word=substr($word,1);
    $word=(string)$word; // substr() may return false on old PHP versions
    if ($word!=="") // skip a bare '+' or '-'
    {
      if ($required) $requiredWords[]=$word;
      if ($forbidden) $forbiddenWords[]=$word;
      if (!in_array($word,$foundWords,true))
      {
        $foundWords[]=$word;
        search($file,$word,$results);
      }
    }
    $word=strtok(" ");
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n";
  fclose($file);
}

main();
00324 



Generated on Mon Mar 31 10:58:43 2008 by  doxygen 1.5.1