00001 /****************************************************************************** 00002 * 00003 * $Id: searchindex.cpp,v 1.7 2001/03/19 19:27:41 root Exp $ 00004 * 00005 * Copyright (C) 1997-2008 by Dimitri van Heesch. 00006 * 00007 * Permission to use, copy, modify, and distribute this software and its 00008 * documentation under the terms of the GNU General Public License is hereby 00009 * granted. No representations are made about the suitability of this software 00010 * for any purpose. It is provided "as is" without express or implied warranty. 00011 * See the GNU General Public License for more details. 00012 * 00013 * Documents produced by Doxygen are derivative works derived from the 00014 * input used in their production; they are not affected by this license. 00015 * 00016 */ 00017 00018 #include "qtbc.h" 00019 #include "searchindex.h" 00020 #include "config.h" 00021 #include <qfile.h> 00022 #include <ctype.h> 00023 00024 00025 // file format: (all multi-byte values are stored in big endian format) 00026 // 4 byte header 00027 // 256*256*4 byte index (4 bytes) 00028 // for each index entry: a zero terminated list of words 00029 // for each word: a \0 terminated string + 4 byte offset to the stats info 00030 // padding bytes to align at 4 byte boundary 00031 // for each word: the number of urls (4 bytes) 00032 // + for each url containing the word 8 bytes statistics 00033 // (4 bytes index to url string + 4 bytes frequency counter) 00034 // for each url: a \0 terminated string 00035 00036 const int numIndexEntries = 256*256; 00037 00038 //-------------------------------------------------------------------- 00039 00040 IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17) 00041 { 00042 m_urls.setAutoDelete(TRUE); 00043 //printf("IndexWord::IndexWord(%s)\n",word); 00044 } 00045 00046 void IndexWord::addUrlIndex(int idx,bool hiPriority) 00047 { 00048 //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority); 00049 URLInfo *ui = m_urls.find(idx); 00050 if (ui==0) 00051 { 00052 //printf("URLInfo::URLInfo(%d)\n",idx); 00053 ui=new URLInfo(idx,0); 00054 m_urls.insert(idx,ui); 00055 } 00056 ui->freq+=2; 00057 if (hiPriority) ui->freq|=1; // mark as high priority document 00058 } 00059 00060 //-------------------------------------------------------------------- 00061 00062 SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1) 00063 { 00064 int i; 00065 m_words.setAutoDelete(TRUE); 00066 m_urls.setAutoDelete(TRUE); 00067 m_index.setAutoDelete(TRUE); 00068 for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>); 00069 } 00070 00071 void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor) 00072 { 00073 if (name==0 || baseName==0) return; 00074 //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor); 00075 QCString url=baseName+Config_getString("HTML_FILE_EXTENSION"); 00076 if (anchor) url+=(QCString)"#"+anchor; 00077 m_urlIndex++; 00078 m_urls.insert(m_urlIndex,new URL(name,url)); 00079 } 00080 00081 static int charsToIndex(const char *word) 00082 { 00083 if (word==0) return -1; 00084 00085 // Fast string hashing algorithm 00086 //register ushort h=0; 00087 //const char *k = word; 00088 //ushort mask=0xfc00; 00089 //while ( *k ) 00090 //{ 00091 // h = (h&mask)^(h<<6)^(*k++); 00092 //} 00093 //return h; 00094 00095 // Simple hashing that allows for substring searching 00096 uint c1=word[0]; 00097 if (c1==0) return -1; 00098 uint c2=word[1]; 00099 if (c2==0) return -1; 00100 return c1*256+c2; 00101 } 00102 00103 void SearchIndex::addWord(const char *word,bool hiPriority) 00104 { 00105 //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority); 00106 //QString wStr=QString(word).lower(); 00107 QString wStr(word); 00108 if (wStr.isEmpty()) return; 00109 wStr=wStr.lower(); 00110 IndexWord *w = m_words[wStr]; 00111 if (w==0) 00112 { 00113 int idx=charsToIndex(wStr); 00114 if (idx<0) return; 00115 w = new IndexWord(wStr); 00116 //fprintf(stderr,"addWord(%s) at index %d\n",word,idx); 00117 m_index[idx]->append(w); 00118 m_words.insert(wStr,w); 00119 } 00120 w->addUrlIndex(m_urlIndex,hiPriority); 00121 } 00122 00123 00124 static void writeInt(QFile &f,int index) 00125 { 00126 f.putch(((uint)index)>>24); 00127 f.putch((((uint)index)>>16)&0xff); 00128 f.putch((((uint)index)>>8)&0xff); 00129 f.putch(((uint)index)&0xff); 00130 } 00131 00132 static void writeString(QFile &f,const char *s) 00133 { 00134 const char *p = s; 00135 while (*p) f.putch(*p++); 00136 f.putch(0); 00137 } 00138 00139 void SearchIndex::write(const char *fileName) 00140 { 00141 int i; 00142 int size=4; // for the header 00143 size+=4*numIndexEntries; // for the index 00144 int wordsOffset = size; 00145 // first pass: compute the size of the wordlist 00146 for (i=0;i<numIndexEntries;i++) 00147 { 00148 QList<IndexWord> *wlist = m_index[i]; 00149 if (!wlist->isEmpty()) 00150 { 00151 QListIterator<IndexWord> iwi(*wlist); 00152 IndexWord *iw; 00153 for (iwi.toFirst();(iw=iwi.current());++iwi) 00154 { 00155 int ws = iw->word().length()+1; 00156 size+=ws+4; // word + url info list offset 00157 } 00158 size+=1; // zero list terminator 00159 } 00160 } 00161 00162 // second pass: compute the offsets in the index 00163 int indexOffsets[numIndexEntries]; 00164 int offset=wordsOffset; 00165 for (i=0;i<numIndexEntries;i++) 00166 { 00167 QList<IndexWord> *wlist = m_index[i]; 00168 if (!wlist->isEmpty()) 00169 { 00170 indexOffsets[i]=offset; 00171 QListIterator<IndexWord> iwi(*wlist); 00172 IndexWord *iw; 00173 for (iwi.toFirst();(iw=iwi.current());++iwi) 00174 { 00175 offset+= iw->word().length()+1; 00176 offset+=4; // word + offset to url info array 00177 } 00178 offset+=1; // zero list terminator 00179 } 00180 else 00181 { 00182 indexOffsets[i]=0; 00183 } 00184 } 00185 int padding = size; 00186 size = (size+3)&~3; // round up to 4 byte boundary 00187 padding = size - padding; 00188 00189 //int statsOffset = size; 00190 QDictIterator<IndexWord> wdi(m_words); 00191 //IndexWord *iw; 00192 int *wordStatOffsets = new int[m_words.count()]; 00193 00194 int count=0; 00195 00196 // third pass: compute offset to stats info for each word 00197 for (i=0;i<numIndexEntries;i++) 00198 { 00199 QList<IndexWord> *wlist = m_index[i]; 00200 if (!wlist->isEmpty()) 00201 { 00202 QListIterator<IndexWord> iwi(*wlist); 00203 IndexWord *iw; 00204 for (iwi.toFirst();(iw=iwi.current());++iwi) 00205 { 00206 //printf("wordStatOffsets[%d]=%d\n",count,size); 00207 wordStatOffsets[count++] = size; 00208 size+=4+iw->urls().count()*8; // count + (url_index,freq) per url 00209 } 00210 } 00211 } 00212 int *urlOffsets = new int[m_urls.count()]; 00213 //int urlsOffset = size; 00214 QIntDictIterator<URL> udi(m_urls); 00215 URL *url; 00216 for (udi.toFirst();(url=udi.current());++udi) 00217 { 00218 urlOffsets[udi.currentKey()]=size; 00219 size+=url->name.length()+1+ 00220 url->url.length()+1; 00221 } 00222 //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset); 00223 QFile f(fileName); 00224 if (f.open(IO_WriteOnly)) 00225 { 00226 // write header 00227 f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S'); 00228 // write index 00229 for (i=0;i<numIndexEntries;i++) 00230 { 00231 writeInt(f,indexOffsets[i]); 00232 } 00233 // write word lists 00234 count=0; 00235 for (i=0;i<numIndexEntries;i++) 00236 { 00237 QList<IndexWord> *wlist = m_index[i]; 00238 if (!wlist->isEmpty()) 00239 { 00240 QListIterator<IndexWord> iwi(*wlist); 00241 IndexWord *iw; 00242 for (iwi.toFirst();(iw=iwi.current());++iwi) 00243 { 00244 writeString(f,iw->word()); 00245 writeInt(f,wordStatOffsets[count++]); 00246 } 00247 f.putch(0); 00248 } 00249 } 00250 // write extra padding bytes 00251 for (i=0;i<padding;i++) f.putch(0); 00252 // write word statistics 00253 for (i=0;i<numIndexEntries;i++) 00254 { 00255 QList<IndexWord> *wlist = m_index[i]; 00256 if (!wlist->isEmpty()) 00257 { 00258 QListIterator<IndexWord> iwi(*wlist); 00259 IndexWord *iw; 00260 for (iwi.toFirst();(iw=iwi.current());++iwi) 00261 { 00262 int numUrls = iw->urls().count(); 00263 writeInt(f,numUrls); 00264 QIntDictIterator<URLInfo> uli(iw->urls()); 00265 URLInfo *ui; 00266 for (uli.toFirst();(ui=uli.current());++uli) 00267 { 00268 writeInt(f,urlOffsets[ui->urlIdx]); 00269 writeInt(f,ui->freq); 00270 } 00271 } 00272 } 00273 } 00274 // write urls 00275 QIntDictIterator<URL> udi(m_urls); 00276 URL *url; 00277 for (udi.toFirst();(url=udi.current());++udi) 00278 { 00279 writeString(f,url->name); 00280 writeString(f,url->url); 00281 } 00282 } 00283 00284 delete[] urlOffsets; 00285 delete[] wordStatOffsets; 00286 } 00287