Fulltext searches with Xapian and PHP

Tags: web-development, php, xapian, mysql

Полнотекстовый поиск с Xapian и PHP

Sometimes MySQL just isn't quick enough, especially when it comes to fulltext searches. Everything needs to be indexed correctly, and if we're using different fields with different weights to compute a relevance percentage, things get complicated very quickly. Xapian to the rescue.

What is Xapian?
Xapian is a search engine library, similar to Lucene and Sphinx. It's written in C++ and is therefore fairly low-level. There are PHP, Perl and Python bindings available for it, which are straightforward to use. Packages are available for Ubuntu and Red Hat, it compiles on OS X, and you can run it on Windows via Cygwin.

Demo Scripts
Rather than explaining why and how, I decided to create some demo scripts instead.

db.sql
Hide code highlighting
1
2
3
4
5
6
7
8
9
10
11
12
13
14
-- Demo schema and seed data for the Xapian indexing example.
CREATE DATABASE `demo`;

CREATE TABLE `demo`.`demo` (
`id` INT( 10 ) UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY ,
`unique_key` VARCHAR( 255 ) NOT NULL ,
`name` VARCHAR( 255 ) NULL DEFAULT NULL ,
`summary` TEXT NULL DEFAULT NULL ,
-- Read by XapianWrapper::index_row() as $row['url']; the seed rows below leave it NULL.
`url` VARCHAR( 255 ) NULL DEFAULT NULL ,
`date` DATETIME NULL DEFAULT NULL ,
UNIQUE (`unique_key`));

-- Two sample rows; `id` is NULL so AUTO_INCREMENT assigns it.
INSERT INTO `demo`.`demo`
(`id`, `unique_key`, `name`, `summary`, `date`)
VALUES (NULL, 'foo', 'foo', 'foo bar test', '2008-11-05 00:00:00'),
(NULL , 'bar', 'bar', 'test foo bar', '2009-11-05 00:00:00');

XapianWrapper.php
Hide code highlighting
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
<?php
// includes
require_once 'xapian.php';
 
// main class
/**
 * Small wrapper around the Xapian PHP bindings.
 *
 * It indexes rows of a MySQL table into a Xapian database, deletes indexed
 * documents by their unique key, and runs searches against the index.
 * Every failure is reported by throwing an Exception (or a subclass).
 */
class XapianWrapper {
  // Xapian value slots in which the raw column values are stored per document.
  const XAPIAN_FIELD_URL = 0;
  const XAPIAN_FIELD_NAME = 1;
  const XAPIAN_FIELD_DATE = 2;
  const XAPIAN_FIELD_UID = 3;
  const XAPIAN_FIELD_SUMMARY = 4;
  // Prefix of the term that identifies a document by its unique key.
  const XAPIAN_PREFIX_UID = "UID:";

  const SETTINGS_XAPIAN_DB = './xapian_db';

  const SETTINGS_MYSQL_HOST = 'localhost';
  const SETTINGS_MYSQL_USER = 'root';
  const SETTINGS_MYSQL_PASS = 'root';
  const SETTINGS_MYSQL_DB = 'demo';
  const SETTINGS_MYSQL_TABLE = 'demo';

  // Page size used when the caller passes no 'count'.
  const DEFAULT_COUNT = 10;

  private $mysql_link;

  private $xapian_read_db;
  private $xapian_write_db;
  private $xapian_stemmer;
  private $xapian_indexer;
  private $xapian_enquire;

  /**
   * Opens the Xapian database read-only and prepares the stemmer and enquire
   * objects used by search().
   *
   * @throws Exception if any Xapian object cannot be created
   */
  private function xapian_init_readonly() {
    try {
      $this->xapian_read_db = new XapianDatabase(self::SETTINGS_XAPIAN_DB);
      $this->xapian_stemmer = new XapianStem("english");
      $this->xapian_enquire = new XapianEnquire($this->xapian_read_db);
    } catch (Exception $e) {
      throw new Exception('Could not initialize Xapian: ' . $e->getMessage());
    }
  }

  /**
   * Opens (or creates) the Xapian database for writing and prepares the term
   * generator used by index().
   *
   * @throws Exception if any Xapian object cannot be created
   */
  private function xapian_init_writable() {
    try {
      $this->xapian_write_db = new XapianWritableDatabase(
        self::SETTINGS_XAPIAN_DB,
        Xapian::DB_CREATE_OR_OPEN
      );
      $this->xapian_indexer = new XapianTermGenerator();
      $this->xapian_stemmer = new XapianStem("english");
      $this->xapian_indexer->set_stemmer($this->xapian_stemmer);
    } catch (Exception $e) {
      throw new Exception('Could not initialize Xapian: ' . $e->getMessage());
    }
  }

  /**
   * Connects to MySQL and selects the demo database.
   *
   * Uses mysqli because the old mysql_* extension was removed in PHP 7.
   * On PHP >= 8.1 mysqli throws mysqli_sql_exception (an Exception subclass)
   * by default instead of returning false, so callers that catch Exception
   * see a failure either way.
   *
   * @throws Exception if the connection or database selection fails
   */
  private function mysql_init() {
    $link = mysqli_connect(
      self::SETTINGS_MYSQL_HOST,
      self::SETTINGS_MYSQL_USER,
      self::SETTINGS_MYSQL_PASS
    );
    if (!$link) {
      throw new Exception('Could not connect: ' . mysqli_connect_error());
    }

    if (!mysqli_select_db($link, self::SETTINGS_MYSQL_DB)) {
      // Read the error before closing; the link is unusable afterwards.
      $error = mysqli_error($link);
      mysqli_close($link);
      throw new Exception('Can\'t use db : ' . $error);
    }

    $this->mysql_link = $link;
  }

  /**
   * Converts a date string to the sortable 'Ymd' form stored in the index.
   *
   * @param mixed $value date string understood by strtotime()
   * @return string|null 'Ymd' string, or null if the value is empty or unparseable
   */
  private static function format_date($value) {
    if ($value === null || $value === '') {
      return null;
    }
    $timestamp = strtotime($value);
    if ($timestamp === false) {
      return null;
    }
    return date('Ymd', $timestamp);
  }

  /**
   * Reads a non-negative integer option from $params.
   *
   * @param mixed  $params  request parameters
   * @param string $key     option name
   * @param int    $default value used when the option is absent
   * @return int value clamped to >= 0
   */
  private static function int_param($params, $key, $default) {
    if (!isset($params[$key])) {
      return $default;
    }
    return max(0, intval($params[$key]));
  }

  /**
   * Indexes one page of rows from the MySQL table into Xapian.
   *
   * @param array $params optional 'offset' (default 0) and 'count'
   *                      (default DEFAULT_COUNT) selecting the page of rows
   * @return stdClass object whose 'indexed' property lists one
   *                  name/guid/url array per indexed row
   * @throws Exception on Xapian or MySQL failure; a partially indexed batch
   *                   is rolled back
   */
  public function index($params) {
    $this->xapian_init_writable();
    $this->mysql_init();

    $response = new stdClass();
    $response->indexed = array();

    $offset = self::int_param($params, 'offset', 0);
    $count = self::int_param($params, 'count', self::DEFAULT_COUNT);
    // Both numbers are integers and the table name is a class constant, so
    // nothing user-controlled is interpolated into the statement.
    $sql = 'SELECT * FROM ' . self::SETTINGS_MYSQL_TABLE
         . ' LIMIT ' . $offset . ', ' . $count . ';';

    $result = mysqli_query($this->mysql_link, $sql);
    if (!$result) {
      $error = mysqli_error($this->mysql_link);
      mysqli_close($this->mysql_link);
      throw new Exception('Invalid query: ' . $error);
    }

    // Pull the (count-limited) page into memory so the MySQL resources can be
    // released before the Xapian transaction starts.
    $rows = array();
    while ($row = mysqli_fetch_assoc($result)) {
      $rows[] = $row;
    }
    mysqli_free_result($result);
    mysqli_close($this->mysql_link);

    $this->xapian_write_db->begin_transaction();
    try {
      foreach ($rows as $row) {
        $response->indexed[] = $this->index_row($row);
      }
    } catch (Exception $e) {
      // Do not leave a half-written batch behind.
      $this->xapian_write_db->cancel_transaction();
      throw $e;
    }
    $this->xapian_write_db->commit_transaction();

    return $response;
  }

  /**
   * Builds the Xapian document for one table row and stores it, replacing
   * any existing document that carries the same unique-key term.
   *
   * @param array $row associative row from the MySQL table
   * @return array 'name', 'guid' and 'url' of the indexed row
   */
  private function index_row($row) {
    $doc = new XapianDocument();

    // Nullable / optional columns are normalised to strings for Xapian.
    $name = strval($row['name']);
    $summary = strval($row['summary']);
    $url = isset($row['url']) ? $row['url'] : '';

    $this->xapian_indexer->set_document($doc);
    // The name is weighted 50x against the summary for relevance ranking.
    $this->xapian_indexer->index_text($name, 50);
    $this->xapian_indexer->index_text($summary, 1);

    $GUID = self::XAPIAN_PREFIX_UID . $row['unique_key'];
    $doc->add_term($GUID);

    $doc->add_value(self::XAPIAN_FIELD_URL, $url);
    $date = self::format_date($row['date']);
    if ($date !== null) {
      // Rows without a usable date get no date value instead of 19700101.
      $doc->add_value(self::XAPIAN_FIELD_DATE, $date);
    }
    $doc->add_value(self::XAPIAN_FIELD_UID, $row['unique_key']);
    $doc->add_value(self::XAPIAN_FIELD_NAME, $name);
    $doc->add_value(self::XAPIAN_FIELD_SUMMARY, $summary);

    $this->xapian_write_db->replace_document(strval($GUID), $doc);

    return array(
      'name' => $row['name'],
      'guid' => $row['unique_key'],
      'url' => $url,
    );
  }

  /**
   * Deletes the documents identified by the given unique keys.
   *
   * @param array $params 'items' => list of unique keys to delete
   * @return array the unique keys that were submitted for deletion
   * @throws Exception on Xapian failure; the batch is rolled back
   */
  public function delete($params) {
    $response = array();

    $items = (isset($params['items']) && is_array($params['items']))
      ? $params['items']
      : array();
    if (count($items) === 0) {
      // Nothing to delete, so the database is not opened for writing.
      return $response;
    }

    $this->xapian_init_writable();

    $this->xapian_write_db->begin_transaction();
    try {
      foreach ($items as $param_guid) {
        $GUID = self::XAPIAN_PREFIX_UID . $param_guid;
        $this->xapian_write_db->delete_document(strval($GUID));
        $response[] = $param_guid;
      }
    } catch (Exception $e) {
      $this->xapian_write_db->cancel_transaction();
      throw $e;
    }
    $this->xapian_write_db->commit_transaction();

    return $response;
  }

  /**
   * Searches the index.
   *
   * @param array $params any of: 'search' (free-text query), 'unique_key',
   *                      'date_from', 'date_to' (strtotime()-style dates),
   *                      'offset' (default 0), 'count' (default DEFAULT_COUNT).
   *                      At least one search criterion is required.
   * @return stdClass 'result_count' (estimated matches), 'results' (list of
   *                  hit arrays) and 'execute' (paging, timing, query debug)
   * @throws Exception if Xapian cannot be opened, a date is unparseable,
   *                   or no criterion was given
   */
  public function search($params) {
    $this->xapian_init_readonly();

    $start = microtime(true);

    // Sub-queries that are AND-ed together below.
    $arr_queries = array();

    // from date
    if (!empty($params['date_from'])) {
      $from = self::format_date($params['date_from']);
      if ($from === null) {
        throw new Exception('Invalid date_from value');
      }
      // Must filter on the slot index_row() writes the date to.
      $arr_queries[] = new XapianQuery(
        XapianQuery::OP_VALUE_GE, self::XAPIAN_FIELD_DATE, $from);
    }

    // to date
    if (!empty($params['date_to'])) {
      $to = self::format_date($params['date_to']);
      if ($to === null) {
        throw new Exception('Invalid date_to value');
      }
      $arr_queries[] = new XapianQuery(
        XapianQuery::OP_VALUE_LE, self::XAPIAN_FIELD_DATE, $to);
    }

    // unique key
    if (!empty($params['unique_key'])) {
      $arr_queries[] = new XapianQuery(self::XAPIAN_PREFIX_UID . $params['unique_key']);
    }

    // free-text query, parsed with stemming
    if (!empty($params['search'])) {
      $qp = new XapianQueryParser();
      $qp->set_stemmer($this->xapian_stemmer);
      $qp->set_database($this->xapian_read_db);
      $qp->set_stemming_strategy(XapianQueryParser::STEM_SOME);
      $arr_queries[] = $qp->parse_query($params['search']);
    }

    if (count($arr_queries) === 0) {
      // Previously null was handed to set_query(); fail with a clear message.
      throw new Exception('No search criteria given');
    }

    // construct final query: start from the last sub-query, AND in the rest
    $query = array_pop($arr_queries);
    foreach ($arr_queries as $sq) {
      $query = new XapianQuery(XapianQuery::OP_AND, $query, $sq);
    }
    $this->xapian_enquire->set_query($query);

    // paging
    $offset = self::int_param($params, 'offset', 0);
    $count = self::int_param($params, 'count', self::DEFAULT_COUNT);
    $matches = $this->xapian_enquire->get_mset($offset, $count);

    $response = new stdClass();
    $response->result_count = $matches->get_matches_estimated();
    $results = array();

    $i = $matches->begin();
    $last = $matches->end();
    while (!$i->equals($last)) {
      $doc = $i->get_document();

      $results[] = array(
        'position' => $i->get_rank() + 1, // ranks are zero-based
        'url' => $doc->get_value(self::XAPIAN_FIELD_URL),
        'name' => $doc->get_value(self::XAPIAN_FIELD_NAME),
        'summary' => $doc->get_value(self::XAPIAN_FIELD_SUMMARY),
        'date' => $doc->get_value(self::XAPIAN_FIELD_DATE),
        'unique_key' => $doc->get_value(self::XAPIAN_FIELD_UID),
        'percent' => $i->get_percent(),
      );

      $i->next();
    }

    $response->results = $results;
    $end = microtime(true);

    // runtime info
    $response->execute = new stdClass();
    $response->execute->call = 'search';
    $response->execute->offset = $offset;
    $response->execute->count = $count;
    $response->execute->start = $start;
    $response->execute->end = $end;
    $response->execute->time = $end - $start;

    // debug stuff
    $response->execute->debug = $query->get_description();

    return $response;
  }
}

index.php
Hide code highlighting
1
2
3
4
5
6
7
<?php

// Demo entry point: index the first page of MySQL rows (default offset and
// count) and dump the resulting report.
require_once 'XapianWrapper.php';

$wrapper = new XapianWrapper();
$report = $wrapper->index(array());
print_r($report);

search.php
Hide code highlighting
1
2
3
4
5
6
7
8
<?php

// Demo entry point: run a free-text search for "foo" and dump the response.
require_once 'XapianWrapper.php';

$wrapper = new XapianWrapper();
$criteria = array();
$criteria['search'] = 'foo';
print_r($wrapper->search($criteria));

delete.php
Hide code highlighting
1
2
3
4
5
6
7
8
9
<?php

// Demo entry point: remove the document with unique key "foo" from the index
// and dump the list of keys that were submitted for deletion.
require_once 'XapianWrapper.php';

$wrapper = new XapianWrapper();
$keys = array('foo');
$deleted = $wrapper->delete(array('items' => $keys));
print_r($deleted);

Using the example
Now we can run the PHP examples from the command line.
Hide code highlighting
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
bash$ php index.php 
stdClass Object
(
    [indexed] => Array
        (
            [0] => Array
                (
                    [name] => foo
                    [guid] => foo
                    [url] => 
                )
 
            [1] => Array
                (
                    [name] => bar
                    [guid] => bar
                    [url] => 
                )
 
        )
 
)
bash$ php search.php 
stdClass Object
(
    [result_count] => 2
    [results] => Array
        (
            [0] => Array
                (
                    [position] => 1
                    [url] => 
                    [name] => foo
                    [summary] => foo bar test
                    [date] => 20081105
                    [unique_key] => foo
                    [percent] => 100
                )
 
            [1] => Array
                (
                    [position] => 2
                    [url] => 
                    [name] => bar
                    [summary] => test foo bar
                    [date] => 20091105
                    [unique_key] => bar
                    [percent] => 50
                )
 
        )
 
    [execute] => stdClass Object
        (
            [call] => search
            [offset] => 0
            [count] => 10
            [start] => 1256674866.79
            [end] => 1256674866.79
            [time] => 0.000944852828979
            [debug] => Xapian::Query(Zfoo:(pos=1))
        )
 
)
bash$ php delete.php 
Array
(
    [0] => foo
)
bash$ php search.php 
stdClass Object
(
    [result_count] => 1
    [results] => Array
        (
            [0] => Array
                (
                    [position] => 1
                    [url] => 
                    [name] => bar
                    [summary] => test foo bar
                    [date] => 20091105
                    [unique_key] => bar
                    [percent] => 100
                )
 
        )
 
    [execute] => stdClass Object
        (
            [call] => search
            [offset] => 0
            [count] => 10
            [start] => 1256674876.02
            [end] => 1256674876.02
            [time] => 0.000872850418091
            [debug] => Xapian::Query(Zfoo:(pos=1))
        )
 
)

Right, I'll leave it up to you to amend the examples to suit your individual needs. As always, any feedback or improvements are welcome.

Original: Fulltext searches with Xapian and PHP

Rating: 12345   << Please, rate this article


Related articles:
   6 Tools To Be An Effective Web Developer
   Understanding scope in object oriented JavaScript
   An Introduction to the Art of Unit Testing in PHP
   Integrating FCKeditor with Zend_Form
   Automated Testing Using Zend Framework


 
 

Leave a comment:

Name


E-mail


Message