【问题标题】:Lucene search: indexing takes forever [closed]Lucene 搜索:索引需要永远 [关闭]
【发布时间】:2016-02-20 10:08:04
【问题描述】:

我在 Lucene 中的程序曾经在使用 RAMDirectory 时出现内存不足错误,所以我切换到 FSDirectory;然而,这一次索引需要永远(超过 5 小时)并且不会结束。我对Lucene很陌生,请我做错了什么。请帮我检查我的代码。 谢谢

这是我的代码:

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class RechercheEngine extends LireFichierCollection {
    static Path path = Paths.get("Ressource/index-directory");
    static ArrayList<String> resultat = new ArrayList<String>();
    final static String file = "Ressources/file_collection.txt";

    public static void indexerEtRechercherDocument(boolean exchange)
            throws IOException, org.apache.lucene.queryparser.classic.ParseException {

        System.out.println("reading documents...");
        LireFichierCollection readDocCollection = new LireFichierCollection();
        readDocCollection.readFile(file);
        System.out.println("Analyzing documents...");
        Analyzer analyzer = new StandardAnalyzer();
        Directory directory = FSDirectory.open(path);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter indexWriter = new IndexWriter(directory, config);

        Document doc = new Document();

        System.out.println("Indexing documents...");
        for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
            String key = entry.getKey();
            String content = entry.getValue();
            // indexing the docKey and Content of each document
            doc.add(new StringField("DocKey", key, Field.Store.YES));
            if (exchange) {
                Stemming st = new Stemming();
                doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
            } else {
                doc.add(new TextField("DocContent", content, Field.Store.NO));
            }
            indexWriter.addDocument(doc);
        }
        indexWriter.close();
        System.out.println("Indexing documents done");
        // I am checking if all documents were indexed properly using tester.txt

        LireRequete readQueries = new LireRequete();
        readQueries.readList();
        for (Map.Entry<String, String> entry : readQueries.queries.entrySet()) {
            Stemming t = new Stemming();
            Query query;
            if (exchange) {
                query = new QueryParser("DocContent", analyzer).parse(t.stemmingAvecStopWord(entry.getValue()));
            } else {
                query = new QueryParser("DocContent", analyzer).parse(entry.getValue());
            }
            System.out.println("Researching documents...");

            IndexReader reader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(reader);

            ScoreDoc[] hits = searcher.search(query, 2).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document hitDoc = searcher.doc(docId);
                System.out.println(hitDoc.get("DocKey"));
                resultat.add(entry.getKey() + " " + hitDoc.get("DocKey") + " " + hits[i].score);
            }
            reader.close();
        }
    }

}

【问题讨论】:

    标签: java indexing lucene


    【解决方案1】:

    您忘记清除文档字段了吗?

    Document doc = new Document();
    System.out.println("Indexing documents...");
    for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
        doc.clear(); //did you forget this line?
        String key = entry.getKey();
        String content = entry.getValue();
        // indexing the docKey and Content of each document
        doc.add(new StringField("DocKey", key, Field.Store.YES));
        if (exchange) {
            Stemming st = new Stemming();
            doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
        } else {
            doc.add(new TextField("DocContent", content, Field.Store.NO));
        }
        indexWriter.addDocument(doc);
    }
    indexWriter.close();
    

    我已经添加了doc.clear(); 所以每次都会清除。现在,如果没有这条线,您的文档应该会以指数方式变得越来越大。

    希望这可以解决。

    PS: 或者在循环中创建新文档:

    System.out.println("Indexing documents...");
    for (Map.Entry<String, String> entry : readDocCollection.docCollection.entrySet()) {
        Document doc = new Document();
        String key = entry.getKey();
        String content = entry.getValue();
        // indexing the docKey and Content of each document
        doc.add(new StringField("DocKey", key, Field.Store.YES));
        if (exchange) {
            Stemming st = new Stemming();
            doc.add(new TextField("DocContent", st.stemmingAvecStopWord(content), Field.Store.NO));
        } else {
            doc.add(new TextField("DocContent", content, Field.Store.NO));
        }
        indexWriter.addDocument(doc);
    }
    indexWriter.close();
    

    【讨论】:

    • 您好 Elbek 感谢您尝试帮助我。但是,当我添加此位时,我收到一条错误消息:方法 clear() 未定义为 Document 类型。
    • 然后在for循环中创建一个doc(Document doc = new Document();),不要重复使用这个doc
    • @CarmenK 我已经更新了我的答案。
    • 天哪,非常感谢。它现在工作。你让我开心:)
    猜你喜欢
    • 2013-04-06
    • 2017-11-29
    • 2012-10-22
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多