search.php

Go to the documentation of this file.
00001 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
00002 <html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
00003 <title>Search</title>
00004 <link href="doxygen.css" rel="stylesheet" type="text/css">
00005 <link href="tabs.css" rel="stylesheet" type="text/css">
00006 </head><body>
00007 <!-- Generated by Doxygen 1.4.7 -->
00008 <div class="tabs">
00009   <ul>
00010     <li><a href="index.html"><span>Main&nbsp;Page</span></a></li>
00011     <li><a href="namespaces.html"><span>Namespaces</span></a></li>
00012     <li><a href="classes.html"><span>Classes</span></a></li>
00013     <li><a href="files.html"><span>Files</span></a></li>
00014     <li><a href="dirs.html"><span>Directories</span></a></li>
00015     <li>
00016       <form action="search.php" method="get">
00017         <table cellspacing="0" cellpadding="0" border="0">
00018           <tr>
00019             <td><label>&nbsp;<u>S</u>earch&nbsp;for&nbsp;</label></td>
00020 
00021 <?php
00022 
/**
 * Heading shown above the list of search hits.
 *
 * @return string the fixed caption text
 */
function search_results()
{
  $caption = "Search Results";
  return $caption;
}
00027 
/**
 * Builds the one-line summary that precedes the result list.
 *
 * @param int $num number of documents that matched the query
 * @return string HTML fragment; non-zero counts are wrapped in <b>
 */
function matches_text($num)
{
  // nothing found
  if ($num==0) return "Sorry, no documents matching your query.";

  // singular wording for exactly one hit
  if ($num==1) return "Found <b>1</b> document matching your query.";

  // every other count gets the plural wording
  return "Found <b>$num</b> documents matching your query. Showing best matches first.";
}
00043 
/**
 * Label printed in front of the matched words of a single hit.
 *
 * @return string the fixed label, including its trailing space
 */
function report_matches()
{
  $label = "Matches: ";
  return $label;
}
/**
 * Emits the query text box and closes the search form, the surrounding
 * table, the list item, the tab list and the tabs <div>.
 *
 * @param string $value text to pre-fill the box with; it is taken from the
 *                      request by the caller, so it is HTML-escaped here
 * @return void output is written with echo
 */
function end_form($value)
{
  // Escape quotes and angle brackets so the value cannot break out of the
  // attribute. The encoding matches the charset the page declares.
  $safe = htmlspecialchars($value, ENT_QUOTES, 'ISO-8859-1');
  echo "            <td><input type=\"text\" name=\"query\" value=\"$safe\" size=\"20\" accesskey=\"s\"/></td>\n          </tr>\n        </table>\n      </form>\n    </li>\n  </ul>\n</div>\n";
}
00052 
/**
 * Reads four bytes from the stream and combines them into one integer,
 * first byte most significant.
 *
 * @param resource $file open stream positioned at the value
 * @return int the combined value
 */
function readInt($file)
{
  $value = 0;
  for ($n=0; $n<4; $n++)
  {
    // shift what we have so far and append the next byte
    $value = ($value<<8) | ord(fgetc($file));
  }
  return $value;
}
00059 
/**
 * Reads characters from the stream up to, and consuming, the next
 * character whose byte value is zero.
 *
 * @param resource $file open stream positioned at the string
 * @return string the characters read, without the terminator
 */
function readString($file)
{
  $text = "";
  for (;;)
  {
    $ch = fgetc($file);
    if (!ord($ch)) break; // terminator reached
    $text .= $ch;
  }
  return $text;
}
00066 
/**
 * Reads the next four characters of the stream.
 *
 * @param resource $file open stream positioned at the header
 * @return string the four characters concatenated in read order
 */
function readHeader($file)
{
  $magic = fgetc($file);
  for ($n=1; $n<4; $n++)
  {
    $magic .= fgetc($file);
  }
  return $magic;
}
00073 
/**
 * Maps a word to its slot number in the index table.
 *
 * Only the first two bytes take part, so every word sharing a two-character
 * prefix lands in the same slot; this is what allows prefix searches.
 *
 * @param string $word the word to hash
 * @return int first byte * 256 + second byte, or -1 when the word is shorter
 *             than two bytes or one of those bytes is zero
 */
function computeIndex($word)
{
  if (strlen($word)<2) return -1;
  // high char of the index (square brackets: the curly-brace offset
  // syntax no longer parses on current PHP)
  $hi = ord($word[0]);
  if ($hi==0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo==0) return -1;
  return $hi*256+$lo;
}
00097 
/**
 * Looks up one query word in the index file and appends an entry for every
 * indexed word that starts with it.
 *
 * Each appended entry is an array with the keys "word" (the query word),
 * "match" (the indexed word), "index" (file offset of the word's document
 * statistics), "full" (true when both words have the same length) and
 * "docs". "docs" is a list of arrays with "idx" (file offset of the
 * document's name and url), "freq", "hi" (low bit of the stored frequency
 * value), "rank", "name" and "url".
 *
 * @param resource $file       open, seekable handle on the index file
 * @param string   $word       query word; compared byte-for-byte
 * @param array    &$statsList result list, extended in place
 * @return array the updated $statsList
 */
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // word has no usable two-byte key

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $listOffset = readInt($file);
  if (!$listOffset) return $statsList; // no words stored under this key

  // Pass 1: collect the indexed words that start with $word.
  $start = sizeof($statsList);
  $count = $start;
  $wordLen = strlen($word);
  fseek($file,$listOffset);
  $w = readString($file);
  while ($w!=="") // an empty string ends the word list
  {
    $statIdx = readInt($file);
    // Byte-wise prefix test. A loose == on the two strings would treat
    // numeric-looking text such as "1e1" and "010" as equal.
    if (strncmp($w,$word,$wordLen)===0)
    {
      $statsList[$count++]=array(
          "word"=>$word,
          "match"=>$w,
          "index"=>$statIdx,
          "full"=>strlen($w)==$wordLen,
          "docs"=>array()
          );
    }
    $w = readString($file);
  }
  $end = sizeof($statsList);

  // Pass 2: load the per-document statistics of every new entry and
  // accumulate the totals used for normalisation.
  // NOTE: the totals add up the stored value as read (low bit included),
  // whereas "freq" keeps the value shifted right by one; kept as it was.
  $totalHi=0;
  $totalFreqHi=0;
  $totalFreqLo=0;
  for ($s=$start;$s<$end;$s++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$s]["full"] ? 2 : 1;
    fseek($file,$statsList[$s]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx=readInt($file);
      $freq=readInt($file);
      $docInfo[$i]=array("idx"  => $idx,
                         "freq" => $freq>>1,
                         "rank" => 0.0,
                         "hi"   => $freq&1
                        );
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi+=$freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo+=$freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"]=readString($file);
      $docInfo[$i]["url"]=readString($file);
    }
    $statsList[$s]["docs"]=$docInfo;
  }

  // Pass 3: turn the frequencies into ranks relative to the totals.
  $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
  for ($s=$start;$s<$end;$s++)
  {
    $multiplier = $statsList[$s]["full"] ? 2 : 1;
    $numDocs = sizeof($statsList[$s]["docs"]);
    for ($i=0;$i<$numDocs;$i++)
    {
      $doc = $statsList[$s]["docs"][$i];
      $weight = $doc["freq"]*$multiplier;
      // high priority docs are lifted above all low priority ones
      if ($doc["hi"]) $weight += $totalFreqLo;
      // a zero total can only come from a damaged index; avoid dividing by it
      $statsList[$s]["docs"][$i]["rank"] =
        $totalFreq ? (float)$weight/$totalFreq : 0.0;
    }
  }
  return $statsList;
}
00195 
/**
 * Merges the per-word search results into one entry per document.
 *
 * Documents are keyed by their url. The ranks a document receives from
 * different words are added up, and every contributing word is recorded in
 * the document's "words" list as an array with "word", "match" and "freq".
 *
 * @param array $results entries with "word", "match" and "docs", where
 *                       "docs" is a list of arrays with "url", "name",
 *                       "rank" and "freq"
 * @param array &$docs   url => document map, extended in place
 * @return array the updated $docs
 */
function combine_results($results,&$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key=$di["url"];
      $rank=$di["rank"];
      // Direct key lookup: constant time, and unlike a loose in_array()
      // over array_keys() it cannot confuse keys such as "1e1" and "10".
      if (isset($docs[$key]))
      {
        $docs[$key]["rank"]+=$rank;
      }
      else
      {
        $docs[$key] = array("url"=>$key,
            "name"=>$di["name"],
            "rank"=>$rank
            );
      }
      $docs[$key]["words"][] = array(
               "word"=>$wordInfo["word"],
               "match"=>$wordInfo["match"],
               "freq"=>$di["freq"]
               );
    }
  }
  return $docs;
}
00225 
/**
 * Keeps only the documents that contain every required word and none of
 * the forbidden words.
 *
 * A document "contains" a word when one of the entries in its "words" list
 * has that word as its "word" value.
 *
 * @param array $docs            url => document map; each document has a
 *                               "words" list
 * @param array &$requiredWords  words that must all be present
 * @param array &$forbiddenWords words of which none may be present
 * @return array the documents that pass, under their original keys
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs=array();
  // foreach instead of each(), which no longer exists on current PHP
  foreach ($docs as $key => $doc)
  {
    // query words that produced a hit in this document
    $docWords = array();
    foreach ($doc["words"] as $wordInfo)
    {
      $docWords[] = $wordInfo["word"];
    }

    $copy = true; // copy entry by default
    foreach ($requiredWords as $reqWord)
    {
      if (!in_array($reqWord,$docWords,true))
      {
        $copy = false; // document is missing a required word
        break;
      }
    }
    if ($copy)
    {
      foreach ($forbiddenWords as $badWord)
      {
        if (in_array($badWord,$docWords,true))
        {
          $copy = false; // document contains a forbidden word
          break;
        }
      }
    }
    if ($copy) $filteredDocs[$key]=$doc;
  }
  return $filteredDocs;
}
00265 
/**
 * Ordering callback that puts the higher "rank" first.
 *
 * @param array $a entry with a "rank" value
 * @param array $b entry with a "rank" value
 * @return int -1 when $a ranks higher, 0 when the ranks are equal, 1 otherwise
 */
function compare_rank($a,$b)
{
  $left = $a["rank"];
  $right = $b["rank"];
  if ($left > $right) return -1;
  if ($left == $right) return 0;
  return 1;
}
00274 
/**
 * Copies $docs into $sorted and orders the copy with compare_rank(),
 * i.e. highest rank first. usort() re-indexes the copy from 0, so the
 * original keys of $docs are not kept.
 *
 * @param array $docs    documents to order; left untouched
 * @param array &$sorted receives the ordered list
 * @return array the same ordered list
 */
function sort_results($docs,&$sorted)
{
  $sorted = $docs;
  usort($sorted, "compare_rank");

  return $sorted;
}
00281 
/**
 * Prints the result table: a heading, the summary line and, for every
 * document, one row with its position and link followed by one row
 * listing the matched words with their frequencies.
 *
 * The "url", "name" and word values are written out as they are.
 *
 * @param array &$docs ordered list of documents, each with "url", "name"
 *                     and a "words" list of arrays with "word", "match"
 *                     and "freq"
 * @return void output is written with echo
 */
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo "  <tr>\n";
  echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo "  </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo "  </tr>\n";
  }
  else
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo "    </td>\n";
    echo "  </tr>\n";
    $num=1;
    foreach ($docs as $doc)
    {
      // row 1: position and link
      echo "  <tr>\n";
      echo "    <td align=\"right\">$num.</td>";
      echo     "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      // close the row explicitly, like every other row of the table
      echo "  </tr>\n";
      // row 2: matched words
      echo "  <tr>\n";
      echo "    <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the indexed word that follows the query word
        $matchRight = substr($wordInfo["match"],strlen($word));
        echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
      }
      echo "    </td>\n";
      echo "  </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}
00323 
/**
 * Entry point of the search page.
 *
 * Opens "search.idx", checks its "DOXS" header, finishes the search form
 * with the submitted query, looks up every word of the query, then
 * combines, filters, sorts and prints the results.
 *
 * Query syntax: words are separated by spaces; a leading "+" marks a word
 * as required, a leading "-" as forbidden. Words are lower-cased before
 * they are looked up.
 *
 * @return void output is written with echo; the script dies with a message
 *              when the PHP version is too old or the index is unusable
 */
function main()
{
  // version_compare() compares component by component; a plain strcmp()
  // would rank "10.0.0" below "4.1.0".
  if (version_compare(phpversion(),'4.1.0') < 0)
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen("search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!="DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query="";
  // "query[]=x" would arrive as an array; only accept plain strings
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query=$_GET["query"];
  }
  end_form($query);
  echo "&nbsp;\n<div class=\"searchresults\">\n";
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  $word=strtok($query," ");
  while ($word!==false) // for each word in the search query
  {
    // The lookup is done in lower case, so the required and forbidden
    // lists must use the same spelling or they would never match.
    $word=strtolower($word);
    if (substr($word,0,1)=='+')
    {
      $word=(string)substr($word,1);
      if ($word!=="") $requiredWords[]=$word;
    }
    if (substr($word,0,1)=='-')
    {
      $word=(string)substr($word,1);
      if ($word!=="") $forbiddenWords[]=$word;
    }
    // a lone "+" or "-" leaves nothing to search for
    if ($word!=="" && !in_array($word,$foundWords,true))
    {
      $foundWords[]=$word;
      search($file,$word,$results);
    }
    $word=strtok(" ");
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n";
  fclose($file);
}

main();
00376 
00377 
00378 ?>
00379 <hr size="1"><address style="align: right;"><small>Generated on Wed May 12 06:12:06 2010 for Second Life Viewer by&nbsp;
00380 <a href="http://www.doxygen.org/index.html">
00381 <img src="doxygen.png" alt="doxygen" align="middle" border="0"></a> 1.4.7 </small></address>
00382 </body>
00383 </html>

Generated on Thu Jul 1 06:09:59 2010 for Second Life Viewer by  doxygen 1.4.7