Documentation is available at lucene-fileindex-defs.php
- <?php
- /* ******************************************************************** */
- /* CATALYST PHP Source Code */
- /* -------------------------------------------------------------------- */
- /* This program is free software; you can redistribute it and/or modify */
- /* it under the terms of the GNU General Public License as published by */
- /* the Free Software Foundation; either version 2 of the License, or */
- /* (at your option) any later version. */
- /* */
- /* This program is distributed in the hope that it will be useful, */
- /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
- /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
- /* GNU General Public License for more details. */
- /* */
- /* You should have received a copy of the GNU General Public License */
- /* along with this program; if not, write to: */
- /* The Free Software Foundation, Inc., 59 Temple Place, Suite 330, */
- /* Boston, MA 02111-1307 USA */
- /* -------------------------------------------------------------------- */
- /* */
- /* Filename: lucene-fileindex-defs.php */
- /* Author: Paul Waite */
- /* Description: Search Engine Module */
- /* Specialised indexing class for indexing file content. */
- /* Still tied to the deprecated lucene-defs.php module. */
- /* */
- /* ******************************************************************** */
- /** @package search */
- /**
- * The file indexer class.
- * This class indexes files on disc, either one by one or as a whole
- * file hierarchy tree.
- * @package search
- */
- class fileindexer {
- // Public properties
- /** Name of the application we are indexing for. Handed to every
- index message created by index_file(). */
- var $application = "";
- /** Hostname or IP of the search engine server to connect to */
- var $host = "";
- /** Port of the search engine server to connect to */
- var $port = "";
- // Private properties
- /** The index ID. Declared here but not referenced by any method
- of this class.
- @access private */
- var $ixid;
- /** ID generation source: one of the ID_FROM_* constants. Set by
- id_generate() and consulted by index_tree() for each file.
- @access private */
- var $idsource = ID_FROM_INC;
- /** Scan for meta tags as fields in file content. Recommended.
- Toggled by scantags() / noscantags(), read by index_file().
- @access private */
- var $metascan = true;
- /** Meta fields definitions array (fieldname => type). Contains
- definitions for the fields we will process if found as meta tags.
- Filled by meta_field(), and therefore also by define_field().
- @access private */
- var $meta_fields = array();
- /** Index fields definitions array. Contains definitions
- for the fields we are expecting to index, keyed by fieldname.
- Each value is a 'type|stored|indexed' string built by define_field().
- @access private */
- var $field_definitions = array();
- /** Fields for indexing. This is an array of fieldname/value
- pairs which should be added during the indexing. These
- fields do not have to appear in $field_definitions. A key
- may carry a type suffix, eg. 'Foo:Date'. Filled by index_field().
- @access private */
- var $indexfields = array();
- /** ID generation offset. Added to the counter by index_tree()
- when the ID source is ID_FROM_INC.
- @access private */
- var $idoffset = 0;
- /** ID generation prefix. Prepended by index_tree() to IDs made
- from the file name, full filename or path.
- @access private */
- var $idprefix = "";
- /** Timeout for indexing commands in seconds (can usually leave
- as nullstring). Declared here but not referenced by any method
- of this class; index_file() sends with a fixed timeout of 120.
- @access private */
- var $timeoutsecs = "";
- /** Path to a lockfile we should give way to. If this value
- is not nullstring, then index_file() waits while the file
- exists. If $lockfile_wait_secs is > 0, then we only wait
- that many seconds before carrying on regardless.
- @access private */
- var $lockfile = "";
- /** Number of seconds to wait on a lockfile. If zero, wait forever.
- @access private */
- var $lockfile_wait_secs = 0;
- /** Indexing execution timer. A microtimer created by the
- constructor and used by index_tree() for progress statistics.
- @access private */
- var $timer;
- // .....................................................................
/**
 * Constructor
 * Create a new file indexer, remembering the search engine connection
 * details and creating the execution timer used by index_tree().
 *
 * Declared as __construct() so that it is actually invoked by 'new' on
 * PHP 8+, where a method named after the class is no longer treated as
 * a constructor. It is declared ahead of the legacy fileindexer() method
 * so PHP 5 does not complain about a redefined constructor.
 * @param string $application Application name
 * @param string $host Hostname or IP of search engine server
 * @param string $port Port of search engine server
 */
function __construct($application="?", $host="", $port="") {
    // Store for reference..
    $this->application = $application;
    $this->host = $host;
    $this->port = $port;
    $this->timer = new microtimer();
} // __construct
// .....................................................................
/**
 * Legacy PHP4-style constructor, kept for backward compatibility.
 * On PHP 4 this is still what 'new fileindexer(...)' invokes, and on any
 * version it can be called explicitly (eg. by a descendant class which
 * chains to its parent the old way). It simply delegates to __construct().
 * @param string $application Application name
 * @param string $host Hostname or IP of search engine server
 * @param string $port Port of search engine server
 */
function fileindexer($application="?", $host="", $port="") {
    $this->__construct($application, $host, $port);
} // fileindexer
- // .....................................................................
/**
 * Define a field to be indexed. Supply the field name, its type (Text,
 * Date or Id), and flags saying whether the search engine should store
 * the content for later retrieval in queries, and whether it should index
 * it. For example you would normally not store raw document content, as
 * that is usually kept elsewhere.
 * The definition is remembered as a 'type|stored|indexed' string, with
 * the two flags spelled 'true' or 'false'.
 * IMPORTANT NOTE: Fields defined here are automatically registered as
 * meta fields as well.
 * @see meta_field()
 * @param string $fieldname Name of the field to index
 * @param string $type Type of field data: Text, Date or Id.
 * @param boolean $stored If true then search engine will store the content itself
 * @param boolean $indexed If true then search engine will index the field content
 */
function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
    $storedflag = "false";
    if ($stored) {
        $storedflag = "true";
    }
    $indexedflag = "false";
    if ($indexed) {
        $indexedflag = "true";
    }
    $this->field_definitions[$fieldname] = implode("|", array($type, $storedflag, $indexedflag));
    // Make the field eligible for pick-up from meta tags too..
    $this->meta_field($fieldname, $type);
} // define_field
- // .....................................................................
/**
 * Nominate a lockfile which indexing must give way to. While the file
 * exists, index_file() holds off sending anything to the search engine.
 * The second parameter puts a limit, in seconds, on how long we are
 * prepared to wait; zero means wait for as long as it takes.
 * @param string $lockfile Path to the lockfile. Nullstring = not defined
 * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
 */
function avoid_lockfile($lockfile, $wait_secs=0) {
    $this->lockfile_wait_secs = $wait_secs;
    $this->lockfile = $lockfile;
} // avoid_lockfile
- // .....................................................................
/**
 * Register a field as one which may be picked up from meta tags in the
 * content of a file being indexed.
 * IMPORTANT NOTE: The rule is strict. ONLY fields registered here, either
 * directly or by way of define_field(), are accepted from meta tag
 * scanning. A meta tag naming any other field is rejected, which lets us
 * restrict what gets indexed and be sure of the field types.
 * @see define_field()
 * @param string $fieldname Name of the field to process as meta tag
 * @param string $type Type of field data: Text, Date or Id.
 */
function meta_field($fieldname, $type) {
    // Keyed on field name, so registering again replaces the type..
    $this->meta_fields[$fieldname] = $type;
} // meta_field
- // .....................................................................
/**
 * Supply field content for indexing. The given value will be indexed
 * against the given field name by every subsequent index_file() call.
 * The field name may include the field type, in the form 'Foo:Date',
 * where 'Date' is the type in this instance. Since 'Text' is the default
 * field type, 'Date' is probably the only one you need to specify as the
 * current implementation stands.
 * @param string $fieldname Name of the field to index.
 * @param string $fieldvalue Content of the field to index
 */
function index_field($fieldname, $fieldvalue) {
    // Keyed on field name, so a later value replaces an earlier one..
    $this->indexfields[$fieldname] = $fieldvalue;
} // index_field
- // .....................................................................
/**
 * Set the source for ID generation. Since we are indexing a bunch of
 * files, the IDs have to be generated on demand inside the loop. So
 * we provide for various ways here, and you can extend this class to
 * provide more if required.
 * Main ways:
 * ID_FROM_INC Increment a counter by 1 each time (with offset)
 * ID_FROM_NAME Take the filename, strip the extension, add prefix
 * ID_FROM_FILENAME Take the full filename, add prefix
 * ID_FROM_PATH Take the full file path, add prefix
 * NB: These are all defined as integer constants.
 *
 * The second parameter is taken as a counter offset when it is not a
 * string, or when it is a numeric string and the source is ID_FROM_INC
 * (the prefix is never used in that mode, and offsets arriving from a
 * command line or a form are always strings). Any other string is taken
 * as the ID prefix. A nullstring, null or false leaves the current prefix
 * and offset untouched; an integer zero explicitly resets the offset.
 * @param integer $idsource Source of ID generation
 * @param mixed $pfxofs String prefix, or integer offset
 */
function id_generate($idsource=ID_FROM_INC, $pfxofs="") {
    $this->idsource = $idsource;
    // Nothing supplied, so leave any existing prefix/offset alone. Strict
    // tests are used here because the loose 0 != "" comparison gives
    // different answers on PHP 7 and PHP 8..
    if ($pfxofs === "" || $pfxofs === null || $pfxofs === false) {
        return;
    }
    $counter_offset = ($idsource == ID_FROM_INC && is_numeric($pfxofs));
    if (is_string($pfxofs) && !$counter_offset) {
        $this->idprefix = $pfxofs;
    }
    else {
        $this->idoffset = (int)$pfxofs;
    }
} // id_generate
- // .....................................................................
/**
 * Switch on tag scanning. With this set, index_file() looks through the
 * content of each file for <meta> tags and the <title> tag, and indexes
 * what it finds as fields. A tag is only used when its field name has
 * been registered via define_field() or meta_field().
 * @see fileindexer::define_field()
 */
function scantags() {
    $this->metascan = (bool) 1;
} // scantags
- // .....................................................................
/**
 * Switch off tag scanning. index_file() will then ignore any <meta> and
 * <title> tags in the content of the files it indexes.
 */
function noscantags() {
    $this->metascan = (bool) 0;
} // noscantags
- // .....................................................................
/**
 * Index a single file located at the given path, using the given ID.
 * The file is read in full, we then give way to any lockfile defined via
 * avoid_lockfile(), build an index message holding the defined fields,
 * any fields scanned from meta/title tags, any explicitly supplied
 * fields, and the tag-stripped file content, and finally send it to the
 * search engine with a timeout of 120.
 *
 * You can use the parameter $fields to supply an array of fieldname/value
 * pairs to index with this file. If a field is a date field, make sure to
 * name it as 'Foo:Date', to cause the field definition to be correct.
 * NB: these pairs are registered through index_field(), so they stay
 * registered and are applied to later calls of this method as well.
 * @param string $path Path to the file to index
 * @param string $id ID to associate with the indexed file content
 * @param mixed $fields Array of field/values to index with file, or false
 * @return boolean True if the index message was sent successfully
 */
function index_file($path, $id, $fields=false) {
    $success = false;
    $f = new inputfile($path);
    if ($f->opened) {
        $f->readall();
        $f->closefile();

        // Wait for a lockfile, if we really have to..
        if ($this->lockfile != "" && file_exists($this->lockfile)) {
            $waitforit = true;
            debugbr("waiting for lockfile..", DBG_DEBUG);
            // The timer is only needed when our patience is limited..
            if ($this->lockfile_wait_secs > 0) {
                $locktimer = new microtimer();
                $locktimer->start();
            }
            do {
                // file_exists() results are cached, so flush before re-testing..
                clearstatcache();
                if (!file_exists($this->lockfile)) {
                    $waitforit = false;
                    debugbr("lockfile has been removed..", DBG_DEBUG);
                }
                elseif ($this->lockfile_wait_secs > 0 && $locktimer->secs() >= $this->lockfile_wait_secs) {
                    // Out of patience: carry on with indexing regardless..
                    $waitforit = false;
                    debugbr("lockfile wait (" . $this->lockfile_wait_secs ."secs) timed out..", DBG_DEBUG);
                }
                else {
                    sleep(1);
                }
            } while ($waitforit === true);
        }

        // Create the index message..
        $ix = new lucene_indexmsg($this->application, $this->host, $this->port);

        // Define the fields for the index message. Each definition was
        // packed by define_field() as 'type|stored|indexed'..
        foreach ($this->field_definitions as $fieldname => $attributes) {
            $bits = explode("|", $attributes);
            $type = $bits[0];
            $stored = (strcasecmp($bits[1], "true") == 0);
            $indexed = (strcasecmp($bits[2], "true") == 0);
            $ix->define_field($fieldname, $type, $stored, $indexed);
        }

        // Clean troublesome byte sequences out of the content. The first
        // pattern removes the bytes E2 80 plus the byte/character after
        // them, which in UTF-8 is the 'general punctuation' range.
        // NOTE(review): the second pattern removes C2 B7 *and* whatever
        // follows it, and the last two only match C2/C3 when followed by
        // a literal '&' - confirm these are as intended before changing.
        $content = preg_replace("/[\xe2][\x80]./", "", $f->content);
        $content = preg_replace("/[\xc2][\xb7]./", "", $content);
        $content = preg_replace("/[\xc2]&/", " ", $content);
        $content = preg_replace("/[\xc3]&/", " ", $content);

        // Scan file content for meta tags for index fields..
        if ($this->metascan) {
            $tagpat = "/<meta name=\"(.*?)\" content=\"(.*?)\">/i";
            $matches = array();
            if (preg_match_all($tagpat, $content, $matches)) {
                $tagcount = count($matches[0]);
                for ($i = 0; $i < $tagcount; $i++) {
                    $fieldname = $matches[1][$i];
                    $fieldvalue = $matches[2][$i];
                    if (isset($this->meta_fields[$fieldname])) {
                        // Get type..
                        $type = $this->meta_fields[$fieldname];
                        if (!strcasecmp($type, "date")) {
                            // Newsquest date field format requires stripping off a prefix
                            // 'DT' - a temporary hack which should be completely transparent
                            // to everyone else using this. NB: originally NewsQuest only
                            // stored date in 'DTdd/mm/yyyy' format. This parsing is also
                            // compatible with the new 'DTdd/mm/yyyy hh:mm[:ss]' format.
                            if (substr($fieldvalue, 0, 2) == "DT") {
                                $fieldvalue = substr($fieldvalue, 2);
                            }
                            // Need to convert to Unix timestamp..
                            $fieldvalue = displaydate_to_timestamp($fieldvalue);
                        }
                        debugbr("meta tag index field: $fieldname=$fieldvalue");
                        $ix->index_field($fieldname, $fieldvalue);
                    }
                    else {
                        debugbr("rejected unlisted tag field: $fieldname");
                    }
                }
            }
            // Check for title tag in HTML page if required field. The field
            // name is the tag name exactly as it is written in the content..
            if (preg_match("/<(title)>(.*?)<\/title>/i", $content, $matches)) {
                $fieldname = $matches[1];
                $fieldvalue = $matches[2];
                if (isset($this->meta_fields[$fieldname])) {
                    debugbr("title tag index field: $fieldname=$fieldvalue");
                    $ix->index_field($fieldname, $fieldvalue);
                }
            }
        } // metascan

        // Deal with passed-in field settings. These are meant to cater
        // for indexing of individual files using this method. We just
        // add them to any existing field/values already set up. Uses
        // foreach since each() no longer exists as of PHP 8..
        if (is_array($fields)) {
            foreach ($fields as $fieldname => $fieldvalue) {
                $this->index_field($fieldname, $fieldvalue);
            }
        }

        // Process field/value pairs which have been added either by the
        // index_field() method, or passed in via the $fields parameter.
        // The name may carry a type as in 'Foo:Date'; default is Text..
        foreach ($this->indexfields as $fieldname => $fieldvalue) {
            $bits = explode(":", $fieldname);
            $type = ((isset($bits[1])) ? $bits[1] : "Text");
            $fieldname = $bits[0];
            debugbr("index field: $fieldname=$fieldvalue");
            $ix->define_field($fieldname, $type);
            $ix->index_field($fieldname, $fieldvalue);
        }

        // Index the file content. We get rid of any HTML tags..
        debugbr("indexing file: $path, ID=$id");
        $ix->index_content($id, strip_tags($content));

        // Send the index message to the search engine. We specify a large
        // timeout since we really want this to succeed and search engine
        // may be in an optimization fugue..
        $success = $ix->send(120);
        if(!$success) {
            debugbr("failed: $ix->error_msg");
        }
    }
    else {
        debugbr("open failed on '$path'");
    }
    return $success;
} // index_file
- // .....................................................................
/**
 * Index a tree of files starting at the path given. We index these in one
 * of four modes, as set by id_generate(), which determines how we generate
 * the ID for each item:
 * 'ID_FROM_INC' mode uses an incremental counter. The first item gets the
 * ID offset plus one (ie. 1 when no offset has been set), and each item
 * after that has an ID incremented by one from the last one.
 * 'ID_FROM_NAME' mode uses the filename, stripped of any path and extension
 * as the ID. If prefix is not a nullstring, then it is prefixed to every
 * filename ID.
 * 'ID_FROM_FILENAME' mode uses the filename, including any extension
 * as the ID. If prefix is not a nullstring, then it is prefixed to every
 * filename ID.
 * 'ID_FROM_PATH' mode uses the full path to the item being indexed as the
 * ID. If prefix is not a nullstring, then it is prefixed to every
 * path ID.
 * Any other ID source setting is treated as 'ID_FROM_INC'.
 *
 * The file will simply be indexed as a single Text field, with the
 * appropriate ID, and no other index fields unless $metascan is set to TRUE.
 * If this is the case, the system will scan the file for HTML meta tags of
 * form: '<meta name="foo" content="bar">'. In this example a field of name
 * 'foo' would be given value 'bar'.
 *
 * The list of items is produced by the 'find' command into a temporary
 * file, which is removed again once indexing is finished. In restart mode
 * the caller supplies the list file, and it is left in place.
 * NB: the lockfile settings passed here always replace any which were
 * set up earlier via avoid_lockfile(), even when left at their defaults.
 * @param string $path Path to the head of the file tree to index
 * @param string $patt Pattern to match, eg. '*.html'
 * @param string $restart If equal to "restart" then treat $path as file of paths
 * @param string $lockfile Path to the lockfile. Nullstring = not defined.
 *                         If defined, we idle whilst this file exists.
 * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
 */
function index_tree($path, $patt="", $restart="", $lockfile="", $wait_secs=0) {
    // Set up any lockfile definition..
    $this->avoid_lockfile($lockfile, $wait_secs);

    $generated_list = false;
    if ($restart == "restart") {
        // Restart from existing paths file..
        $tmpfname = $path;
        debugbr("restarting with existing item list $path", DBG_DEBUG);
    }
    else {
        // Use find to generate item list to a temporary file..
        debugbr("generating item list", DBG_DEBUG);
        $tmpfname = tempnam("/tmp", "LU");
        if ($tmpfname === false) {
            debugbr("failed to create temporary item list file", DBG_DEBUG);
            return;
        }
        $generated_list = true;
        // Everything is escaped, so that paths and patterns containing
        // spaces or shell metacharacters reach find intact..
        $cmd = "find " . escapeshellarg($path);
        if ($patt != "") $cmd .= " -name " . escapeshellarg($patt);
        $cmd .= " >" . escapeshellarg($tmpfname);
        exec($cmd);
    }

    $treelist = new inputfile($tmpfname);
    if ($treelist->opened) {
        // Find the number of items..
        debugbr("counting items", DBG_DEBUG);
        $todo = (int) exec("cat " . escapeshellarg($tmpfname) . "|wc -l");
        if ($todo > 0) {
            $done = 0; $succeeded = 0; $failed = 0; $last = 0;
            debugbr("$todo items to index", DBG_DEBUG);
            $this->timer->start();
            $idix = 0;
            if ($this->idsource == ID_FROM_INC) {
                $idix += $this->idoffset;
            }
            // NOTE(review): the loop ends on the first falsy line returned
            // by readln(), so presumably an empty or "0" line would cut the
            // list short - confirm against the inputfile class.
            while ($path = $treelist->readln()) {
                // Generate an ID to use..
                switch ($this->idsource) {
                    case ID_FROM_NAME:
                        // Use filename, minus extension (the part after
                        // the last dot, if there is a dot at all)..
                        $fname = basename($path);
                        $dotpos = strrpos($fname, ".");
                        if ($dotpos !== false) {
                            $fname = substr($fname, 0, $dotpos);
                        }
                        $id = $this->idprefix . $fname;
                        break;
                    case ID_FROM_FILENAME:
                        // Use full filename..
                        $id = $this->idprefix . basename($path);
                        break;
                    case ID_FROM_PATH:
                        // Use full file path..
                        $id = $this->idprefix . $path;
                        break;
                    case ID_FROM_INC:
                    default:
                        // Use incremented index. Also the fallback for an
                        // unrecognised source, so that $id is always set..
                        $idix += 1;
                        $id = $idix;
                        break;
                } // switch

                // Index the file with new ID..
                if ($this->index_file($path, $id)) {
                    debugbr("$id indexed", DBG_DEBUG);
                    $succeeded += 1;
                }
                else {
                    // A failure does not stop the run, we just count it..
                    debugbr("$path index failed", DBG_DEBUG);
                    $failed += 1;
                }

                // Progress check..
                $done += 1;
                // If the verbose output option is enabled, we compile
                // stats and display these via the debugger. A mark is
                // produced when the whole-number percentage is a multiple
                // of 5 which we have not reported yet..
                if (debugging()) {
                    $pct = ($done / $todo) * 100;
                    $pct_int = (int)(floor($pct));
                    // Modulus on the integer, not on the float..
                    $pct_mod = $pct_int % 5;
                    if ($pct_mod == 0 && $pct_int > $last) {
                        $secperdoc = $this->timer->secs() / $done;
                        $timedone = $this->timer->formatted_time();
                        $timeleft = nicetime(($todo - $done) * $secperdoc);
                        $ms = $this->timer->millisecs();
                        $msper = number_format( ($ms / $done), 0);
                        debugbr("Mark: $pct_int% $timedone ($done) Rate:$msper" . "ms/item Left:$timeleft", DBG_DEBUG);
                        $last = $pct_int;
                    }
                }
            } // while

            // Wrap it up..
            $this->timer->stop();
            // Final stats if verbose mode..
            if (debugging()) {
                $secs = $this->timer->secs();
                $msper = number_format( (1000 * $secs / $todo), 2);
                // Kept as a raw number: nicetime() is given seconds, and a
                // number_format()ed string may contain thousands separators..
                $sper1000 = ($secs / $todo) * 1000;
                debugbr("time taken per item: " . $msper . "msec", DBG_DEBUG);
                debugbr("time per 1000 items: " . nicetime($sper1000), DBG_DEBUG);
                debugbr("total time taken: " . $this->timer->formatted_time(), DBG_DEBUG);
                debugbr("successfully indexed: $succeeded", DBG_DEBUG);
                debugbr("indexing failures: $failed", DBG_DEBUG);
            }
        }
        else {
            debugbr("nothing to index", DBG_DEBUG);
        }
        // Close tree list file, whether or not it held anything..
        $treelist->closefile();
    }
    else {
        debugbr("failed to open $tmpfname", DBG_DEBUG);
    }

    // Tidy away the item list if it was us who created it. A list which
    // was supplied by the caller for a restart is never touched..
    if ($generated_list && file_exists($tmpfname)) {
        unlink($tmpfname);
    }
} // index_tree
- } // fileindexer class
- // ----------------------------------------------------------------------
- ?>
Documentation generated by phpDocumentor 1.3.0RC3