Wednesday, February 8, 2012

Lucene: a very good search engine

I- Hello Lucene Example:


package com.ictelecom.lucene.search.test;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;

public class HelloLucene
{
    public static void main(String[] args) throws IOException, ParseException
    {
        // 0. Specify the analyzer for tokenizing text.
        //    The same analyzer should be used for indexing and searching.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);

        // 1. Create the index.
        Directory index = new RAMDirectory();

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer);

        IndexWriter w = new IndexWriter(index, config);
        addDoc(w, "Lucene in Action");
        addDoc(w, "Lucene for Dummies");
        addDoc(w, "Managing Gigabytes");
        addDoc(w, "lucene test");
        w.close();

        // 2. Build the query.
        String querystr = args.length > 0 ? args[0] : "lucene test";

        // The "title" arg specifies the default field to use
        // when no field is explicitly specified in the query.
        Query q = new QueryParser(Version.LUCENE_35, "title", analyzer).parse(querystr);

        // 3. Search.
        int hitsPerPage = 20;
        IndexSearcher searcher = new IndexSearcher(index, true);
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // 4. Display the results.
        System.out.println("Found " + hits.length + " hits.");
        for (int i = 0; i < hits.length; ++i)
        {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            System.out.println((i + 1) + ". " + d.get("title"));
        }

        // The searcher can only be closed when there is
        // no more need to access the documents.
        searcher.close();
    }

    private static void addDoc(IndexWriter w, String value) throws IOException
    {
        Document doc = new Document();
        doc.add(new Field("title", value, Field.Store.YES, Field.Index.ANALYZED));
        w.addDocument(doc);
    }
}
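
The example above keeps the index in a RAMDirectory, so it disappears when the JVM exits. For a persistent index you would normally use an FSDirectory instead. A minimal sketch (the path /tmp/lucene-index is just an example and assumed to be writable):

import org.apache.lucene.store.FSDirectory;
import java.io.File;

// open (or create) an index directory on disk instead of in memory
Directory index = FSDirectory.open(new File("/tmp/lucene-index"));

The rest of the code is unchanged: IndexWriterConfig, IndexWriter and IndexSearcher work on any Directory implementation.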


II- Sample Example (indexing and searching files):

This tutorial sketches a small application: we want to index text files and then search those files for words. It only gives an overview and is not intended to be a working application; the code shown is not complete.
There are three classes:
  1. One for indexing the files
  2. One for converting File objects to indexable documents
  3. One for searching the index
We will start with the indexer class. Lucene uses an IndexWriter for indexing documents. To index an object, the file must first be converted to a document with fields that can be indexed. The FileIndexer class has an index method which receives the File object to index.

public class FileIndexer {

    public static void index(File file) {
        // 1. Convert the object to be indexed into a Document.
        // 2. Write the Document to an IndexWriter.

        // directory where the index files are stored
        String indexFile = "/tmp/fileindex";

        // to index documents, they are 'written' or added to an IndexWriter
        IndexWriter writer = null;
        try {
            File f;
            boolean create = true;
            // create the index only if the directory does not exist yet
            if ((f = new File(indexFile)).exists() && f.isDirectory()) {
                create = false;
            } else {
                create = true;
            }
            writer = new IndexWriter(indexFile, new StandardAnalyzer(), create);
            writer.mergeFactor = 20;

            // now add this document to the index. We use an adapter class
            // which is given a file and returns a document that Lucene can index.
            writer.addDocument(FileDocument.Document(file));
            writer.optimize();
        } catch (InterruptedException e) {
            throw new IndexException("Unable to index document.", e);
        } catch (IOException e) {
            throw new IndexException("Unable to index document.", e);
        } finally {
            close(writer);
        }
    }

    // close the writer
    public static void close(IndexWriter writer) {
        if (null != writer) {
            try {
                writer.close();
            } catch (Exception e) {
            }
        }
    }
}
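
To make the flow concrete, here is a small, hypothetical driver that indexes every file of a directory using the FileIndexer class above (the directory path and the null check are assumptions, not part of the original sketch):

// hypothetical usage of the FileIndexer sketch above
File dir = new File("/tmp/textfiles");   // example directory containing the files to index
File[] files = dir.listFiles();
if (files != null) {
    for (int i = 0; i < files.length; i++) {
        if (files[i].isFile()) {
            // convert the file to a Document and add it to the index
            FileIndexer.index(files[i]);
        }
    }
}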

The conversion class used in FileIndexer is simple: it converts the File object to a Lucene document. Usually you use a class with a static Document method as a factory.

public class FileDocument {

    // Lucene can only index objects of type Document,
    // so the object to be indexed must be converted to
    // such a document, usually with a static factory method.
    public static Document Document(File file) {
        // load the content of the file, get its name, path and date,
        // and store them in content, name, path and date ...

        Document doc = new Document();
        DateFormat df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.GERMANY);

        // we want to index the fields content, name and date.
        // we can then search for "house" everywhere, or for
        // "content:house" in the content field only.
        // fields can also be added that are not indexed but can be
        // accessed at search time; this is useful e.g. for storing
        // object primary keys.
        doc.add(Field.Text("content", content));
        doc.add(Field.Text("path", path));
        doc.add(Field.Text("name", name));
        doc.add(Field.Text("date", df.format(date)));
        return doc;
    }
}
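
As a side note on the comment about un-indexed fields: in the old Field API used here, a value can be stored with the document without being searchable by using Field.UnIndexed. A one-line sketch (pk is a hypothetical primary-key string, not part of the original example):

// stored with the document and retrievable at search time, but not searchable
doc.add(Field.UnIndexed("pk", pk));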



The FileSearcher class uses the index to find documents.
  1. Get a QueryString
  2. Get an IndexSearcher
  3. Parse the Query
  4. Give the parsed Query to the IndexSearcher 
  5. The IndexSearcher will return an array of matching documents, sorted by hit quality


// get the query string, e.g. from the command line
    ...

// create a searcher from the index file
String indexFile = "/tmp/fileindex";
Searcher searcher = null;
try {
    searcher = new IndexSearcher(indexFile);
} catch (IOException e) {
    System.out.println("Unable to open index file: " + indexFile);
    System.exit(1);
}

// parse the query string
Query query = null;
try {
    // if no field prefix is given for a word, search in the
    // content field by default
    query = QueryParser.parse(queryString, "content", new StandardAnalyzer());
} catch (ParseException e) {
    close(searcher);
    System.out.println("Unable to parse: " + queryString);
    System.exit(1);
}

// get the hits from the searcher for the given query
Hits hits = null;
try {
    hits = searcher.search(query);
} catch (IOException e) {
    close(searcher);
    System.out.println("IO Error.");
    e.printStackTrace();
    System.exit(1);
}
// iterate over the results; the result is an array of documents
try {
    // display the first 10 results
    int start = 0;
    final int HITS_PER_PAGE = 10;
    int end = Math.min(HITS_PER_PAGE, hits.length());

    if (hits.length() > 0) {
        PrintfFormat pf = new PrintfFormat("%-20s %-30s %-30s");
        System.out.println(pf.sprintf(new Object[] { "Path", "Name", "Date" }));
        for (int i = start; i < end; i++) {
            System.out.println(pf.sprintf(new Object[] {
                // retrieve the indexed fields from the result documents.
                // we could also store a persistent object key here and load
                // the objects to display further attributes
                hits.doc(i).get("path"),
                hits.doc(i).get("name"),
                hits.doc(i).get("date") }));
        }
    } else {
        System.out.println("No matching files found.");
    }
} catch (IOException e) {
    e.printStackTrace();
} finally {
    close(searcher);
}