Wednesday, February 8, 2012

Lucene: a very good search engine

I- Hello Lucene Example:


package com.ictelecom.lucene.search.test;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;

public class HelloLucene
{
    public static void main(String[] args) throws IOException, ParseException
    {
        // 0. Specify the analyzer for tokenizing text.
        //    The same analyzer should be used for indexing and searching.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);

        // 1. Create the index.
        Directory index = new RAMDirectory();

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer);

        IndexWriter w = new IndexWriter(index, config);
        addDoc(w, "Lucene in Action");
        addDoc(w, "Lucene for Dummies");
        addDoc(w, "Managing Gigabytes");
        addDoc(w, "lucene test");
        w.close();

        // 2. Build the query.
        String querystr = args.length > 0 ? args[0] : "lucene test";

        // The "title" arg specifies the default field to use
        // when no field is explicitly specified in the query.
        Query q = new QueryParser(Version.LUCENE_35, "title", analyzer).parse(querystr);

        // 3. Search.
        int hitsPerPage = 20;
        IndexSearcher searcher = new IndexSearcher(index, true);
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // 4. Display the results.
        System.out.println("Found " + hits.length + " hits.");
        for (int i = 0; i < hits.length; ++i)
        {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            System.out.println((i + 1) + ". " + d.get("title"));
        }

        // The searcher can only be closed when there is
        // no more need to access the documents.
        searcher.close();
    }

    private static void addDoc(IndexWriter w, String value) throws IOException
    {
        Document doc = new Document();
        doc.add(new Field("title", value, Field.Store.YES, Field.Index.ANALYZED));
        w.addDocument(doc);
    }
}
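
The example above keeps the index in a RAMDirectory, so it disappears when the JVM exits. For a persistent index you would normally use an FSDirectory instead. A minimal sketch (the path /tmp/lucene-index is just an example and assumed to be writable):

import org.apache.lucene.store.FSDirectory;
import java.io.File;

// open (or create) an index directory on disk instead of in memory
Directory index = FSDirectory.open(new File("/tmp/lucene-index"));

The rest of the code is unchanged: IndexWriterConfig, IndexWriter and IndexSearcher work on any Directory implementation.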


II- Sample Example (indexing and searching files):

This tutorial sketches a small application: we want to index text files and then search those files for words. It only gives an overview and is not intended to be a working application; the code shown is not complete.
There are three classes:
  1. One for indexing the files
  2. One for converting File objects to indexable documents
  3. One for searching the index
We will start with the indexer class. Lucene uses an IndexWriter for indexing documents. To index an object, the file must first be converted to a document with fields that can be indexed. The FileIndexer class has an index method which receives the File object to index.

public class FileIndexer {

    public static void index(File file) {
        // 1. Convert the object to be indexed into a Document.
        // 2. Write the Document to an IndexWriter.

        // directory where the index files are stored
        String indexFile = "/tmp/fileindex";

        // to index documents, they are 'written' or added to an IndexWriter
        IndexWriter writer = null;
        try {
            File f;
            boolean create = true;
            // create the index only if the directory does not exist yet
            if ((f = new File(indexFile)).exists() && f.isDirectory()) {
                create = false;
            } else {
                create = true;
            }
            writer = new IndexWriter(indexFile, new StandardAnalyzer(), create);
            writer.mergeFactor = 20;

            // now add this document to the index. We use an adapter class
            // which is given a file and returns a document that Lucene can index.
            writer.addDocument(FileDocument.Document(file));
            writer.optimize();
        } catch (InterruptedException e) {
            throw new IndexException("Unable to index document.", e);
        } catch (IOException e) {
            throw new IndexException("Unable to index document.", e);
        } finally {
            close(writer);
        }
    }

    // close the writer
    public static void close(IndexWriter writer) {
        if (null != writer) {
            try {
                writer.close();
            } catch (Exception e) {
            }
        }
    }
}
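
To make the flow concrete, here is a small, hypothetical driver that indexes every file of a directory using the FileIndexer class above (the directory path and the null check are assumptions, not part of the original sketch):

// hypothetical usage of the FileIndexer sketch above
File dir = new File("/tmp/textfiles");   // example directory containing the files to index
File[] files = dir.listFiles();
if (files != null) {
    for (int i = 0; i < files.length; i++) {
        if (files[i].isFile()) {
            // convert the file to a Document and add it to the index
            FileIndexer.index(files[i]);
        }
    }
}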

The conversion class used in FileIndexer is simple: it converts the File object to a Lucene document. Usually you use a class with a static Document method as a factory.

public class FileDocument {

    // Lucene can only index objects of type Document,
    // so the object to be indexed must be converted to
    // such a document, usually with a static factory method.
    public static Document Document(File file) {
        // load the content of the file, get its name, path and date,
        // and store them in content, name, path and date ...

        Document doc = new Document();
        DateFormat df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.GERMANY);

        // we want to index the fields content, name and date.
        // we can then search for "house" everywhere, or for
        // "content:house" in the content field only.
        // fields can also be added that are not indexed but can be
        // accessed at search time; this is useful e.g. for storing
        // object primary keys.
        doc.add(Field.Text("content", content));
        doc.add(Field.Text("path", path));
        doc.add(Field.Text("name", name));
        doc.add(Field.Text("date", df.format(date)));
        return doc;
    }
}
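
As a side note on the comment about un-indexed fields: in the old Field API used here, a value can be stored with the document without being searchable by using Field.UnIndexed. A one-line sketch (pk is a hypothetical primary-key string, not part of the original example):

// stored with the document and retrievable at search time, but not searchable
doc.add(Field.UnIndexed("pk", pk));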



The FileSearcher class uses the index to find documents.
  1. Get a QueryString
  2. Get an IndexSearcher
  3. Parse the Query
  4. Give the parsed Query to the IndexSearcher 
  5. The IndexSearcher will return an array of matching documents, sorted by hit quality


// get the query string, e.g. from the command line
    ...

// create a searcher from the index file
String indexFile = "/tmp/fileindex";
Searcher searcher = null;
try {
    searcher = new IndexSearcher(indexFile);
} catch (IOException e) {
    System.out.println("Unable to open index file: " + indexFile);
    System.exit(1);
}

// parse the query string
Query query = null;
try {
    // if no field prefix is given for a word, search in the
    // content field by default
    query = QueryParser.parse(queryString, "content", new StandardAnalyzer());
} catch (ParseException e) {
    close(searcher);
    System.out.println("Unable to parse: " + queryString);
    System.exit(1);
}

// get the hits from the searcher for the given query
Hits hits = null;
try {
    hits = searcher.search(query);
} catch (IOException e) {
    close(searcher);
    System.out.println("IO Error.");
    e.printStackTrace();
    System.exit(1);
}
// iterate over the results; the result is an array of documents
try {
    // display the first 10 results
    int start = 0;
    final int HITS_PER_PAGE = 10;
    int end = Math.min(HITS_PER_PAGE, hits.length());

    if (hits.length() > 0) {
        PrintfFormat pf = new PrintfFormat("%-20s %-30s %-30s");
        System.out.println(pf.sprintf(new Object[] { "Path", "Name", "Date" }));
        for (int i = start; i < end; i++) {
            System.out.println(pf.sprintf(new Object[] {
                // retrieve the indexed fields from the result documents.
                // we could also store a persistent object key here and load
                // the objects to display further attributes
                hits.doc(i).get("path"),
                hits.doc(i).get("name"),
                hits.doc(i).get("date") }));
        }
    } else {
        System.out.println("No matching files found.");
    }
} catch (IOException e) {
    e.printStackTrace();
} finally {
    close(searcher);
}