Lucene .NET 更新数据答案

【问题标题】：Lucene .NET Update data（Lucene .NET 更新数据）
【发布时间】：2014-09-29 07:05:20
【问题描述】：

我几天前开始使用 Lucene，但在调试我的解决方案时，发现在使用 Lucene 时遇到了一个问题。为了解决这个问题，我新建了一个自定义的测试项目，并尝试了各种不同的方案，但在花了 2 天时间寻找解决办法之后，我放弃了......

我的问题：

我创建了一个自定义类，并创建了该类的一个普通数组。然后为每个元素创建 Document 对象，并通过 IndexWriter 对其建立索引。这一切都正常，搜索效果也很好。但是，当我尝试使用 IndexWriter.UpdateDocument 更新某个文档（比如更新 id 为“5”的文档）时，它会新建一个 id = 5 的文档。最后我得到了 2 个 id = 5 的文档：一个旧的，一个新的。如果把 IndexWriter 构造函数中的那个布尔参数设为 'true'，那么更新时它只保存那 1 个更新后的文档，并删除之前的所有索引。确切地说，我不能每次都重建整个数据库的索引，因为我的数据量很大（在我的构造器上大约有 600 个互联网资源），我只需要更新发生变化的数据（用新数据替换旧数据），并保留之前的索引。有人知道我哪里做错了吗？

附：对不起我的英语。

/// <summary>
/// Plain data holder for one record to be indexed: an identifier,
/// a display name and a free-text body.
/// </summary>
class mydoc
{
    /// <summary>
    /// Creates a record from its three string components.
    /// </summary>
    /// <param name="ID">Value stored in <see cref="id"/>.</param>
    /// <param name="Name">Value stored in <see cref="name"/>.</param>
    /// <param name="Content">Value stored in <see cref="content"/>.</param>
    public mydoc(string ID, string Name, string Content)
    {
        this.id = ID;
        this.name = Name;
        this.content = Content;
    }

    // Identifier of the record, kept as a string.
    public string id;

    // Human-readable name of the record.
    public string name;

    // Free-text body of the record.
    public string content;
}

// Console repro for the question: builds a small Lucene index from an array of
// mydoc records, dumps it, tries to update ten of the records through
// IndexWriter.UpdateDocument, dumps the index again and finally runs a search.
// NOTE(review): this is the failing sample from the question; the notes below
// mark the spots that the question text and its answer identify as the cause.
class Program
{
    // Entry point: index -> dump -> update -> dump -> search -> print results.
    static void Main(string[] args)
    {
        // Phase 1: build 11 in-memory records with ids "0".."10".
        Console.WriteLine("Create data array...");
        mydoc[] docs = new mydoc[11];
        docs[0] = new mydoc("0", "Name0", "tet 5");
        docs[1] = new mydoc("1", "Name1", "aaaa text");
        docs[2] = new mydoc("2", "Name2", "and me test ");
        docs[3] = new mydoc("3", "Name3", "I am new tes 3");
        docs[4] = new mydoc("4", "Name4", "I am new tes 4");
        docs[5] = new mydoc("5", "Name5", "I am new test 5");
        docs[6] = new mydoc("6", "Name6", "I am new text 6");
        docs[7] = new mydoc("7", "Name7", "I am new text 7");
        docs[8] = new mydoc("8", "Name8", "I am new text 8");
        docs[9] = new mydoc("9", "Name9", "I am new text 9");
        docs[10] = new mydoc("10", "Name10", "I am new test 10");

        // Phase 2: write every record into an index stored in the "tmp" directory.
        Console.WriteLine("index processing...");
        var dir = new DirectoryInfo("tmp");
        FSDirectory fsdir = FSDirectory.Open(dir);
        Analyzer analyzer = new StandardAnalyzer(Net.Util.Version.LUCENE_29);
        IndexWriter writer = new IndexWriter(fsdir , analyzer,true, IndexWriter.MaxFieldLength.UNLIMITED);

        for (int i = 0; i < docs.Length; i++)
        {
            writer.AddDocument(Convert(docs[i]));
        }
        writer.Optimize(true);

        writer.Close(true);

        Console.WriteLine("index done !");

        // Phase 3: dump the stored fields of every document in the fresh index.
        IndexReader reader = IndexReader.Open(fsdir, true);
        for (int i = 0; i < reader.MaxDoc;i++)
        {
            Document doc = reader.Document(i);
            Console.WriteLine("id = \"{0}\", Name = \"{1}\", Context = \"{2}\"", doc.Get("ID"),doc.Get("Name"),doc.Get("Content"));
        }
        reader.Close();

        // Phase 4: update records "0".."9" (record "10" is intentionally left out).
        // NOTE(review): the third argument is true here; per the question text this
        // makes the writer drop the previously stored documents, which is why
        // record "10" is missing afterwards. With false the old documents are kept.
        IndexWriter updater = new IndexWriter(fsdir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        // NOTE(review): each Term below is built from a single string. The accepted
        // answer shows the working form is new Term("ID", "<id>"), i.e. field name
        // plus the existing value, so these calls do not match the old documents.
        // NOTE(review): the writer was built with a LUCENE_29 analyzer while the
        // per-call analyzers use LUCENE_30 - confirm whether the mismatch is intended.
        updater.UpdateDocument(new Term("0"), Convert(new mydoc("0", "New name 0", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("1"), Convert(new mydoc("1", "New name 1", "prosto obitr test")),new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("2"), Convert(new mydoc("2", "New name 2", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("3"), Convert(new mydoc("3", "New name 3", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("4"), Convert(new mydoc("4", "New name 4", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("5"), Convert(new mydoc("5", "New name 5", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("6"), Convert(new mydoc("6", "New name 6", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("7"), Convert(new mydoc("7", "New name 7", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("8"), Convert(new mydoc("8", "New name 8", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));
        updater.UpdateDocument(new Term("9"), Convert(new mydoc("9", "New name 9", "prosto obitr test")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));

        updater.Optimize();
        updater.Close(true);

        // Phase 5: dump the index again to see the effect of the updates.
        // NOTE(review): this second reader is never closed in this method.
        reader = IndexReader.Open(fsdir, true);
        Console.WriteLine("New updated data:");
        for (int i = 0; i < reader.MaxDoc; i++)
        {
            Document doc = reader.Document(i);
            Console.WriteLine("id = \"{0}\", Name = \"{1}\", Context = \"{2}\"", doc.Get("ID"), doc.Get("Name"), doc.Get("Content"));
        }

        // Pause so the dumps can be inspected before the search output appears.
        Console.ReadKey();


        // Phase 6: reopen the directory and search the index for the word "test".
        Console.WriteLine("search processing...");
        string query = "test";
        fsdir = FSDirectory.Open(dir);
        IndexSearcher searcher = new IndexSearcher(fsdir, true);
        Console.WriteLine("Searching phrase \"{0}\"", query);
        List<KeyValuePair<int, int>> results = find(query, searcher);

        searcher.Close();
        fsdir.Close();

        Console.WriteLine("Results:");
        for (int i = 0; i < results.Count; i++)
        {
            try
            {
                // Print the parsed "ID" field value of each hit (the pair's Value).
                Console.WriteLine(results[i].Value);
            }
            catch (Exception ex)
            {
                // NOTE(review): any exception is silently ignored here and ex is unused.
                continue;
            }
        }
        Console.WriteLine("\n\rDone !");
        Console.ReadKey();

    }

    /// <summary>
    /// Parses <paramref name="query"/> against the "Name" and "Content" fields
    /// and runs it on <paramref name="searcher"/>, asking for at most 99 hits.
    /// </summary>
    /// <param name="query">Query text handed to the multi-field query parser.</param>
    /// <param name="searcher">Open searcher over the index to query.</param>
    /// <returns>
    /// One pair per hit: Key is the hit's Doc number, Value is the hit's stored
    /// "ID" field parsed as an int (int.Parse throws if it is not numeric).
    /// </returns>
    static List<KeyValuePair<int,int>> find(string query, IndexSearcher searcher)
    {
        // NOTE(review): the query is parsed with SimpleAnalyzer while the documents
        // are indexed with StandardAnalyzer - confirm this is intended.
        var parser = new MultiFieldQueryParser(Net.Util.Version.LUCENE_30, new[] { "Name", "Content" }, new SimpleAnalyzer());
        var score = searcher.Search(parser.Parse(query), 99).ScoreDocs;
        var docIDs = score.Select(x => new KeyValuePair<int, int>
            (
                x.Doc, int.Parse(searcher.Doc(x.Doc).Get("ID"))
            )
            ).ToList();
        return docIDs;
    }


    /// <summary>
    /// Builds a Lucene Document from a <see cref="mydoc"/> record.
    /// </summary>
    /// <param name="doc">Record whose id, name and content are copied.</param>
    /// <returns>
    /// A document with three stored fields: "ID" (NOT_ANALYZED), "Name" and
    /// "Content" (both ANALYZED).
    /// </returns>
    static Document Convert(mydoc doc)
    {
        var document = new Document();
        document.Add(new Field("ID", doc.id, Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.Add(new Field("Name", doc.name, Field.Store.YES, Field.Index.ANALYZED));
        document.Add(new Field("Content", doc.content, Field.Store.YES, Field.Index.ANALYZED));

        return document;
    }
}

在这种情况下，docs[10] 会直接从索引中消失。如果在下面这一行中

IndexWriter updater = new IndexWriter(fsdir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

把 'true' 替换为 'false'，它就会创建新文档，而不是替换旧文档。

调用 updater.Commit() 也没有帮助。

【问题讨论】：

您可以尝试将您的程序减少到最低限度，以便其他人更容易帮助调试、阅读代码等吗？
我不知道还能精简哪些地方，但我可以解释一下。首先，我用自定义类的对象（mydoc）创建一个数组。然后把每个 mydoc 转换为 Document（Lucene 对象），通过 IndexWriter 把它添加到索引中。接下来，我显示 Lucene 索引中现有的所有文档。之后，我用 IndexWriter（updater）的 UpdateDocument 进行更改。为了查看 Lucene 索引中发生的变化，我再次显示索引中存储的所有文档。最后，我按单词“test”在文档中搜索，并显示“Content”字段中包含“test”的结果文档的 ID。就这些。
我不了解 IndexWriter，但我可以在您的代码中看到一些可能是问题的地方。 (1) 在您的更新程序中，您只有 10 个文档，而不是 11 个。 Doc[10] 不在列表中。 (2) 此外，您似乎创建了新文档而不是调用旧文档并更新它们。我可能错了。
在这个例子里，我故意只更新 10 个而不是 11 个：最初索引了 11 个文档，更新其中 10 个，所以输出中应该得到 11 个文档，其中 1 个文档仍然是旧数据。但实际显示出来的只有我更新的那 10 个。这意味着更新时删除了所有已存储的数据，只添加了新数据。是的，全部重新索引可以成功运行，但我不能每 10-30 分钟就重新索引 100 MB 的数据，那样性能开销非常大。

标签： c# lucene.net

【解决方案1】：

问题解决了。我的错误在于没有正确理解下面这个方法中 Term 类型参数的含义：

IndexWriter.UpdateDocument(Term term, Document doc);

它需要像这样创建新的 Term 实例（在我的情况下）：

updater.UpdateDocument(new Term("ID", "5"), Convert(new mydoc("5", "New name 5", "simple new test text")), new StandardAnalyzer(Net.Util.Version.LUCENE_30));

在 Term 构造函数中，第一个参数 “ID” 是我用作唯一标识的字段名（该字段不分词，即以 NOT_ANALYZED 方式索引），第二个参数 “5” 是索引中旧文档的 “ID” 字段的现有值。

【讨论】：