searchindex.cpp

Go to the documentation of this file.
00001 /******************************************************************************
00002  *
00003  * $Id: searchindex.cpp,v 1.7 2001/03/19 19:27:41 root Exp $
00004  *
00005  * Copyright (C) 1997-2008 by Dimitri van Heesch.
00006  *
00007  * Permission to use, copy, modify, and distribute this software and its
00008  * documentation under the terms of the GNU General Public License is hereby 
00009  * granted. No representations are made about the suitability of this software 
00010  * for any purpose. It is provided "as is" without express or implied warranty.
00011  * See the GNU General Public License for more details.
00012  *
00013  * Documents produced by Doxygen are derivative works derived from the
00014  * input used in their production; they are not affected by this license.
00015  *
00016  */
00017 
00018 #include "qtbc.h"
00019 #include "searchindex.h"
00020 #include "config.h"
00021 #include <qfile.h>
00022 #include <ctype.h>
00023 
00024 
00025 // file format: (all multi-byte values are stored in big endian format)
00026 //   4 byte header
00027 //   256*256*4 byte index (4 bytes)
00028 //   for each index entry: a zero terminated list of words 
00029 //   for each word: a \0 terminated string + 4 byte offset to the stats info
00030 //   padding bytes to align at 4 byte boundary
00031 //   for each word: the number of urls (4 bytes) 
00032 //               + for each url containing the word 8 bytes statistics
00033 //                 (4 bytes index to url string + 4 bytes frequency counter)
00034 //   for each url: a \0 terminated string
00035 
00036 const int numIndexEntries = 256*256;
00037 
00038 //--------------------------------------------------------------------
00039 
00040 IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17)
00041 {
00042   m_urls.setAutoDelete(TRUE);
00043   //printf("IndexWord::IndexWord(%s)\n",word);
00044 }
00045 
00046 void IndexWord::addUrlIndex(int idx,bool hiPriority)
00047 {
00048   //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);
00049   URLInfo *ui = m_urls.find(idx);
00050   if (ui==0)
00051   {
00052     //printf("URLInfo::URLInfo(%d)\n",idx);
00053     ui=new URLInfo(idx,0);
00054     m_urls.insert(idx,ui);
00055   }
00056   ui->freq+=2;
00057   if (hiPriority) ui->freq|=1; // mark as high priority document
00058 }
00059 
00060 //--------------------------------------------------------------------
00061 
00062 SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1)
00063 {
00064   int i;
00065   m_words.setAutoDelete(TRUE);
00066   m_urls.setAutoDelete(TRUE);
00067   m_index.setAutoDelete(TRUE);
00068   for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>);
00069 }
00070 
00071 void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor)
00072 {
00073   if (name==0 || baseName==0) return;
00074   //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);
00075   QCString url=baseName+Config_getString("HTML_FILE_EXTENSION");
00076   if (anchor) url+=(QCString)"#"+anchor;  
00077   m_urlIndex++;
00078   m_urls.insert(m_urlIndex,new URL(name,url));
00079 }
00080 
00081 static int charsToIndex(const char *word)
00082 {
00083   if (word==0) return -1;
00084 
00085   // Fast string hashing algorithm
00086   //register ushort h=0;
00087   //const char *k = word;
00088   //ushort mask=0xfc00;
00089   //while ( *k ) 
00090   //{
00091   //  h = (h&mask)^(h<<6)^(*k++);
00092   //}
00093   //return h;
00094 
00095   // Simple hashing that allows for substring searching
00096   uint c1=word[0];
00097   if (c1==0) return -1;
00098   uint c2=word[1];
00099   if (c2==0) return -1;
00100   return c1*256+c2;
00101 }
00102 
00103 void SearchIndex::addWord(const char *word,bool hiPriority)
00104 {
00105   //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority);
00106   //QString wStr=QString(word).lower();
00107   QString wStr(word);
00108   if (wStr.isEmpty()) return;
00109   wStr=wStr.lower();
00110   IndexWord *w = m_words[wStr];
00111   if (w==0)
00112   {
00113     int idx=charsToIndex(wStr);
00114     if (idx<0) return;
00115     w = new IndexWord(wStr);
00116     //fprintf(stderr,"addWord(%s) at index %d\n",word,idx);
00117     m_index[idx]->append(w);
00118     m_words.insert(wStr,w);
00119   }
00120   w->addUrlIndex(m_urlIndex,hiPriority);
00121 }
00122 
00123 
00124 static void writeInt(QFile &f,int index)
00125 {
00126   f.putch(((uint)index)>>24);
00127   f.putch((((uint)index)>>16)&0xff);
00128   f.putch((((uint)index)>>8)&0xff);
00129   f.putch(((uint)index)&0xff);
00130 }
00131 
00132 static void writeString(QFile &f,const char *s)
00133 {
00134   const char *p = s;
00135   while (*p) f.putch(*p++);
00136   f.putch(0);
00137 }
00138 
00139 void SearchIndex::write(const char *fileName)
00140 {
00141   int i;
00142   int size=4; // for the header
00143   size+=4*numIndexEntries; // for the index
00144   int wordsOffset = size;
00145   // first pass: compute the size of the wordlist
00146   for (i=0;i<numIndexEntries;i++)
00147   {
00148     QList<IndexWord> *wlist = m_index[i];
00149     if (!wlist->isEmpty())
00150     {
00151       QListIterator<IndexWord> iwi(*wlist);
00152       IndexWord *iw;
00153       for (iwi.toFirst();(iw=iwi.current());++iwi)
00154       {
00155         int ws = iw->word().length()+1; 
00156         size+=ws+4; // word + url info list offset
00157       }
00158       size+=1; // zero list terminator
00159     }
00160   }
00161 
00162   // second pass: compute the offsets in the index
00163   int indexOffsets[numIndexEntries];
00164   int offset=wordsOffset;
00165   for (i=0;i<numIndexEntries;i++)
00166   {
00167     QList<IndexWord> *wlist = m_index[i];
00168     if (!wlist->isEmpty())
00169     {
00170       indexOffsets[i]=offset;
00171       QListIterator<IndexWord> iwi(*wlist);
00172       IndexWord *iw;
00173       for (iwi.toFirst();(iw=iwi.current());++iwi)
00174       {
00175         offset+= iw->word().length()+1; 
00176         offset+=4; // word + offset to url info array 
00177       }
00178       offset+=1; // zero list terminator
00179     }
00180     else
00181     {
00182       indexOffsets[i]=0;
00183     }
00184   }
00185   int padding = size;
00186   size = (size+3)&~3; // round up to 4 byte boundary
00187   padding = size - padding;
00188 
00189   //int statsOffset = size;
00190   QDictIterator<IndexWord> wdi(m_words);
00191   //IndexWord *iw;
00192   int *wordStatOffsets = new int[m_words.count()];
00193   
00194   int count=0;
00195 
00196   // third pass: compute offset to stats info for each word
00197   for (i=0;i<numIndexEntries;i++)
00198   {
00199     QList<IndexWord> *wlist = m_index[i];
00200     if (!wlist->isEmpty())
00201     {
00202       QListIterator<IndexWord> iwi(*wlist);
00203       IndexWord *iw;
00204       for (iwi.toFirst();(iw=iwi.current());++iwi)
00205       {
00206         //printf("wordStatOffsets[%d]=%d\n",count,size);
00207         wordStatOffsets[count++] = size;
00208         size+=4+iw->urls().count()*8; // count + (url_index,freq) per url
00209       }
00210     }
00211   }
00212   int *urlOffsets = new int[m_urls.count()];
00213   //int urlsOffset = size;
00214   QIntDictIterator<URL> udi(m_urls);
00215   URL *url;
00216   for (udi.toFirst();(url=udi.current());++udi)
00217   {
00218     urlOffsets[udi.currentKey()]=size;
00219     size+=url->name.length()+1+
00220           url->url.length()+1;
00221   }
00222   //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);
00223   QFile f(fileName);
00224   if (f.open(IO_WriteOnly))
00225   {
00226     // write header
00227     f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S');
00228     // write index
00229     for (i=0;i<numIndexEntries;i++)
00230     {
00231       writeInt(f,indexOffsets[i]);
00232     }
00233     // write word lists
00234     count=0;
00235     for (i=0;i<numIndexEntries;i++)
00236     {
00237       QList<IndexWord> *wlist = m_index[i];
00238       if (!wlist->isEmpty())
00239       {
00240         QListIterator<IndexWord> iwi(*wlist);
00241         IndexWord *iw;
00242         for (iwi.toFirst();(iw=iwi.current());++iwi)
00243         {
00244           writeString(f,iw->word());
00245           writeInt(f,wordStatOffsets[count++]);
00246         }
00247         f.putch(0);
00248       }
00249     }
00250     // write extra padding bytes
00251     for (i=0;i<padding;i++) f.putch(0);
00252     // write word statistics
00253     for (i=0;i<numIndexEntries;i++)
00254     {
00255       QList<IndexWord> *wlist = m_index[i];
00256       if (!wlist->isEmpty())
00257       {
00258         QListIterator<IndexWord> iwi(*wlist);
00259         IndexWord *iw;
00260         for (iwi.toFirst();(iw=iwi.current());++iwi)
00261         {
00262           int numUrls = iw->urls().count();
00263           writeInt(f,numUrls);
00264           QIntDictIterator<URLInfo> uli(iw->urls());
00265           URLInfo *ui;
00266           for (uli.toFirst();(ui=uli.current());++uli)
00267           {
00268             writeInt(f,urlOffsets[ui->urlIdx]);
00269             writeInt(f,ui->freq);
00270           }
00271         }
00272       }
00273     }
00274     // write urls
00275     QIntDictIterator<URL> udi(m_urls);
00276     URL *url;
00277     for (udi.toFirst();(url=udi.current());++udi)
00278     {
00279       writeString(f,url->name);
00280       writeString(f,url->url);
00281     }
00282   }
00283 
00284   delete[] urlOffsets;
00285   delete[] wordStatOffsets;
00286 }
00287 



Generated on Mon Mar 31 10:58:43 2008 by  doxygen 1.5.1