libept
0.5.25
|
00001 // -*- C++ -*- 00002 #include <xapian.h> 00003 #include <ept/core/apt.h> 00004 #include <wibble/regexp.h> 00005 #include <wibble/sys/pipe.h> 00006 #include <wibble/sys/exec.h> 00007 00008 #ifndef EPT_XAPIAN_H 00009 #define EPT_XAPIAN_H 00010 00011 namespace ept { 00012 namespace core { 00013 namespace xapian { 00014 00015 // Allocate value indexes for known values 00016 const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1; 00017 const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2; 00018 const Xapian::valueno VAL_POPCON = 10; 00019 const Xapian::valueno VAL_ITERATING_RATING = 20; 00020 const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21; 00021 const Xapian::valueno VAL_ITERATING_USABILITY = 22; 00022 const Xapian::valueno VAL_ITERATING_SECURITY = 23; 00023 const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24; 00024 const Xapian::valueno VAL_ITERATING_QUALITY = 25; 00025 const Xapian::valueno VAL_ITERATING_SUPPORT = 26; 00026 const Xapian::valueno VAL_ITERATING_ADOPTION = 27; 00027 00028 struct TagFilter : public Xapian::ExpandDecider 00029 { 00030 virtual bool operator()(const std::string &term) const { 00031 return term[0] == 'X' && term[1] == 'T'; 00032 } 00033 }; 00034 00035 struct List { 00036 char m_enqPlace[sizeof(Xapian::Enquire)]; 00037 mutable Xapian::MSet m_matches; 00038 mutable Xapian::MSet::const_iterator m_iter; 00039 mutable int m_pos; 00040 typedef List Type; 00041 00042 static const size_t chunkSize = 20; 00043 00044 List head() const { 00045 seek(); 00046 return *this; 00047 } 00048 00049 Token token() const { 00050 Token t; 00051 t._id = m_iter.get_document().get_data(); 00052 return t; 00053 } 00054 00055 bool operator<( const List &o ) const { 00056 return token() < o.token(); 00057 } 00058 00059 void seek() const { 00060 if ( m_matches.size() == chunkSize && m_iter == m_matches.end() ) { 00061 m_matches = enq().get_mset( m_pos, chunkSize ); 00062 m_iter = m_matches.begin(); 00063 m_pos += chunkSize; 00064 } 00065 } 00066 00067 bool empty() const { 00068 if ( m_pos == -1 ) 00069 return true; 00070 seek(); 00071 return m_matches.size() < 30 && m_iter == m_matches.end(); 00072 } 00073 00074 List tail() const { 00075 List t = *this; 00076 t.seek(); 00077 t.m_iter ++; 00078 return t; 00079 } 00080 00081 Xapian::Enquire const &enq() const { 00082 return *reinterpret_cast< Xapian::Enquire const * >( m_enqPlace ); 00083 } 00084 00085 List( Xapian::Enquire _enq ) 00086 { 00087 Xapian::Enquire *e = new (m_enqPlace) Xapian::Enquire( _enq ); 00088 assert_eq( e, &enq() ); 00089 m_matches = enq().get_mset( 0, chunkSize ); 00090 m_iter = m_matches.begin(); 00091 m_pos = chunkSize; 00092 } 00093 00094 List() {} 00095 }; 00096 00097 struct Query { 00098 Xapian::Database *m_db; 00099 Xapian::Enquire m_enq; 00100 Xapian::Stem m_stem; 00101 typedef std::set< std::string > Terms; 00102 Terms m_include, m_exclude, m_secondary; 00103 int m_cutoff; 00104 bool m_expand; 00105 00106 void setQualityCutoff( int c ) { 00107 m_cutoff = c; 00108 } 00109 00110 void setExpand( bool e ) { m_expand = e; } 00111 00112 Query( Xapian::Database &e ) : m_db( &e ), m_enq( e ) { 00113 m_cutoff = 50; 00114 m_expand = true; 00115 } 00116 00117 wibble::Tokenizer queryTokenizer( std::string q ) const { 00118 return wibble::Tokenizer( q, "[A-Za-z0-9._+:-]+", REG_EXTENDED ); 00119 } 00120 00121 template< typename Out > 00122 void tokenizeQuery( std::string q, Out o ) const 00123 { 00124 wibble::Tokenizer tok = queryTokenizer( q ); 00125 for (wibble::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i ) 00126 { 00127 if ( (*i).find( "::" ) != std::string::npos ) { // assume tag 00128 *o++ = ("XT" + *i); 00129 } else { 00130 std::string t = wibble::str::tolower(*i); 00131 std::string s = m_stem(t); 00132 *o++ = t; 00133 if (s != t) 00134 *o++ = ("Z" + s); 00135 } 00136 } 00137 } 00138 00139 template< typename Out > 00140 void expand( Out o ) const 00141 { 00142 Xapian::RSet rset; 00143 // Get the top 5 results as 'good ones' to compute the search expansion 00144 Xapian::MSet mset = m_enq.get_mset(0, 5); 00145 for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i ) 00146 rset.add_document(i); 00147 // Get the expanded set, only expanding the query with tag names 00148 TagFilter tagf; 00149 Xapian::ESet eset = m_enq.get_eset(5, rset, &tagf); 00150 for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i ) 00151 *o++ = *i; 00152 } 00153 00154 void updateEnquire() { 00155 // set up query now 00156 Xapian::Query inc( Xapian::Query::OP_OR, 00157 m_include.begin(), 00158 m_include.end() ), 00159 exc( Xapian::Query::OP_OR, 00160 m_exclude.begin(), 00161 m_exclude.end() ), 00162 secondary( Xapian::Query::OP_OR, 00163 m_secondary.begin(), 00164 m_secondary.end() ), 00165 secondary1( Xapian::Query::OP_SCALE_WEIGHT, secondary, 0.02 ), 00166 query1( Xapian::Query::OP_AND_NOT, inc, exc ), 00167 query( Xapian::Query::OP_OR, query1, secondary1 ); 00168 00169 m_enq.set_query( query ); 00170 00171 if ( m_expand ) { 00172 m_expand = false; 00173 expand( std::inserter( m_include, m_include.begin() ) ); 00174 updateEnquire(); 00175 m_expand = true; 00176 return; 00177 } 00178 00179 Xapian::MSet first = m_enq.get_mset(0, 1, 0, 0, 0); 00180 Xapian::MSetIterator ifirst = first.begin(); 00181 if ( ifirst != first.end() ) { 00182 // Xapian::percent cutoff = ifirst.get_percent() * m_cutoff / 100; 00183 // m_enq.set_cutoff(cutoff); 00184 } 00185 } 00186 00187 List results() { 00188 updateEnquire(); 00189 return List( m_enq ); 00190 } 00191 00192 std::map< std::string, int > relevantTags( int n = 30 ) { 00193 updateEnquire(); 00194 std::map< std::string, int > relev; 00195 Xapian::RSet rset; 00196 Xapian::MSet mset = m_enq.get_mset(0, 100); 00197 for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i ) 00198 rset.add_document(i); 00199 // Get the expanded set, only expanding the query with tag names 00200 TagFilter tagf; 00201 Xapian::ESet eset = m_enq.get_eset(n, rset, &tagf); 00202 for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i ) 00203 relev.insert( relev.begin(), 00204 std::make_pair( 00205 std::string( *i, 2, std::string::npos ), 00206 i.get_weight() ) ); 00207 return relev; 00208 } 00209 00210 void addTerms( std::string t, bool partial = false, bool exclude = false ) { 00211 if ( t.empty() ) 00212 return; 00213 Terms &to = exclude ? m_exclude : m_include; 00214 std::vector< std::string > tok; 00215 tokenizeQuery( t, std::back_inserter( tok ) ); 00216 if ( partial ) { 00217 if ( tok.back().size() == 1 ) { 00218 tok.pop_back(); 00219 } else { 00220 std::copy( 00221 m_db->allterms_begin( tok.back() ), 00222 m_db->allterms_end( tok.back() ), 00223 std::back_inserter( tok ) ); 00224 } 00225 } 00226 std::copy( tok.begin(), tok.end(), std::inserter( to, to.begin() ) ); 00227 } 00228 00229 void addTerms( const Terms &t, bool exclude = false ) { 00230 Terms &to = exclude ? m_exclude : m_include; 00231 std::copy( t.begin(), t.end(), std::inserter( to, to.begin() ) ); 00232 } 00233 00234 void addSecondaryTerm( const std::string &term, bool partial = false ) { 00235 if ( partial ) { 00236 std::copy( 00237 m_db->allterms_begin( term ), 00238 m_db->allterms_end( term ), 00239 std::inserter( m_secondary, m_secondary.begin() ) ); 00240 } else { 00241 m_include.insert( m_secondary.begin(), term ); 00242 } 00243 } 00244 00245 }; 00246 00247 struct Source 00248 { 00249 protected: 00250 mutable Xapian::Database m_db; 00251 Xapian::Stem m_stem; 00252 mutable bool m_opened; 00253 00255 static std::string toLower(const std::string& str); 00256 00263 void normalize_and_add(Xapian::Document& doc, const std::string& term, 00264 int& pos) const; 00265 00266 public: 00267 Source(); 00268 00270 Xapian::Database& db() { 00271 open(); 00272 return m_db; 00273 } 00274 00276 const Xapian::Database& db() const { 00277 open(); 00278 return m_db; 00279 } 00280 00281 void open() const; 00282 void invalidate() { 00283 m_db = Xapian::Database(); 00284 m_opened = false; 00285 } 00286 00288 time_t timestamp() const; 00289 00290 void updateLeniently( AptDatabase &apt, OpProgress *op = 0 ) { 00291 if (apt.timestamp() - timestamp() > 86400 * 8) // a little over a week 00292 update( op ); 00293 } 00294 00295 void update( OpProgress *op = 0 ) { 00296 if ( !op ) 00297 op = new OpProgress(); 00298 00299 wibble::exception::AddContext _ctx( "Rebuilding Xapian database." ); 00300 int outfd; 00301 std::string op_str; 00302 00303 wibble::sys::Exec ex( "update-apt-xapian-index" ); 00304 ex.args.push_back( "--batch-mode" ); 00305 ex.searchInPath = true; 00306 ex.forkAndRedirect( 0, &outfd, 0 ); 00307 00308 wibble::sys::Pipe monit( outfd ); 00309 while ( !monit.eof() ) { 00310 std::string line = monit.nextLine(); 00311 if ( line.empty() ) { 00312 usleep( 100000 ); 00313 continue; 00314 } 00315 std::cerr << "got : " << line << std::endl; 00316 if ( wibble::str::startsWith( line, "begin: " ) ) { 00317 op_str = std::string( line, 7, std::string::npos ); 00318 op->OverallProgress( 0, 100, 100, op_str ); 00319 00320 } else if ( wibble::str::startsWith( line, "done: " ) ) { 00321 op->Done(); 00322 } else if ( wibble::str::startsWith( line, "progress: " ) ) { 00323 wibble::ERegexp re( "progress: ([0-9]+)/([0-9]+)", 3 ); 00324 if ( re.match( line ) ) { 00325 assert_eq( re[2], "100" ); 00326 op->OverallProgress( atoi( re[1].c_str() ), 100, 100, op_str ); 00327 } 00328 } 00329 } 00330 ex.waitForSuccess(); 00331 invalidate(); 00332 } 00333 00335 bool hasData() const { return timestamp() > 0; } 00336 00337 Query query( const std::string &s, 00338 bool expand = true, 00339 int qualityCutoff = 50 ) 00340 { 00341 Query q( db() ); 00342 q.setQualityCutoff( qualityCutoff ); 00343 q.setExpand( expand ); 00344 q.addTerms( s ); 00345 if ( s.length() > 2 ) 00346 q.addSecondaryTerm( "XP" + s, true ); 00347 return q; 00348 } 00349 00350 Query partialQuery( const std::string &s ) { 00351 Query q( db() ); 00352 q.addTerms( s, true ); // partial 00353 return q; 00354 } 00355 00357 // bool needsRebuild(apt::Apt& apt); 00358 00359 Xapian::docid docidByName(const std::string& pkgname) const; 00360 00364 Xapian::Query makeORQuery(const std::string& keywords) const; 00365 00372 Xapian::Query makePartialORQuery(const std::string& keywords) const; 00373 00377 template<typename ITER> 00378 Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const 00379 { 00380 return Xapian::Query(Xapian::Query::OP_OR, begin, end); 00381 } 00382 00384 std::vector<std::string> expand(Xapian::Enquire& enq) const; 00385 00386 // std::vector<std::string> similar(const std::string& pkg); 00387 00391 Xapian::Query makeRelatedQuery(const std::string& pkgname) const; 00392 00396 double getDoubleValue(const std::string& pkgname, 00397 Xapian::valueno val_id) const; 00398 00402 int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const; 00403 }; 00404 00405 } 00406 } 00407 } 00408 00409 #endif