试图让java读取用solr创建的lucene索引答案

【问题标题】：Trying to get java to read lucene index created with solr试图让java读取用solr创建的lucene索引
【发布时间】：2012-12-27 03:56:07
【问题描述】：

我有一个用 solr 创建的 lucene 索引。 lucene 版本是 3.6.1。

我在网上找到了一个读取lucene索引的java程序：

http://www.javacodegeeks.com/2010/05/introduction-to-apache-lucene-for-full.html

我针对本地环境修改了该程序，但对于一个在索引中确实有结果的查询，它总是告诉我没有找到任何命中。程序不起作用之后，我把代码改为使用 StandardAnalyzer 而不是 SimpleAnalyzer，结果仍然不行。

代码如下：

package com.javacodegeeks.lucene;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Minimal command-line example that runs a single query against an existing
 * Lucene index directory and prints one stored field per hit.
 *
 * <p>NOTE(review): the query is analyzed with {@link StandardAnalyzer}. If the
 * index was written with a different analysis chain (for example a Solr field
 * type that applies stemming), the query terms may not match the indexed terms
 * and the search can legitimately return zero hits.
 */
public class StandardSearcher {

    /**
     * Runs a fixed query ("science") against a fixed index location.
     *
     * @param args ignored
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void main(String[] args) throws Exception {

        File indexDir = new File("/path/to/solr/data/index/");
        String query = "science";
        int hits = 100;

        StandardSearcher searcher = new StandardSearcher();
        searcher.searchIndex(indexDir, query, hits);

    }

    /**
     * Parses {@code queryStr} against the "title" field, searches the index in
     * {@code indexDir}, and prints the stored "filename" value of each hit
     * followed by the number of hits returned.
     *
     * @param indexDir directory containing the Lucene index
     * @param queryStr query in Lucene query-parser syntax
     * @param maxHits  maximum number of hits to retrieve
     * @throws Exception if opening the index, parsing, or searching fails
     */
    private void searchIndex(File indexDir, String queryStr, int maxHits)
        throws Exception {

        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

        Directory directory = FSDirectory.open(indexDir);
        try {
            IndexSearcher searcher = new IndexSearcher(directory);
            try {
                Query query = new QueryParser(Version.LUCENE_36, "title", analyzer).parse(queryStr);

                TopDocs topDocs = searcher.search(query, maxHits);

                ScoreDoc[] hits = topDocs.scoreDocs;
                for (ScoreDoc hit : hits) {
                    Document d = searcher.doc(hit.doc);
                    // NOTE(review): prints "null" when the document has no stored
                    // field named "filename" - confirm the field name against the schema.
                    System.out.println(d.get("filename"));
                }

                System.out.println("Found " + hits.length);
            } finally {
                // Release the reader opened by the searcher even if the search fails.
                searcher.close();
            }
        } finally {
            // Release the file handles held by the directory.
            directory.close();
        }

    }

}

我做错了什么？浏览 solrconfig.xml 我无法分辨 solr 默认使用哪个分析器。这就是我尝试 SimpleAnalyzer 和 StandardAnalyzer 的原因。

如果能提供有关如何调试此问题的建议，我将不胜感激。

更新：这是我的架构中的字段：

<field name="metaDataUrl" type="string" indexed="true" stored="true" required="true"/>
<field name="title" type="text" stored="true" indexed="true"/>
<field name="snippet" type="text" indexed="true" stored="true"/>
<field name="rest" type="string" stored="true" indexed="false" multiValued="true"/>
<field name="date_indexed" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<field name="all" type="text" stored="false" indexed="true" multiValued="true"/>

还有，这里是 schema.xml 中 fieldType 文本的 XML：

<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
    words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
    so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
    Synonyms and stopwords are customized by external files, and stemming is enabled.
    -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <!-- Index-time chain: whitespace tokenization, stop-word removal,
       word-delimiter splitting (with catenation of word and number parts),
       lower-casing, then English Snowball stemming. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- in this example, we will only use synonyms at query time
    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
    -->
    <!-- Case insensitive stop word removal.
      add enablePositionIncrements=true in both the index and query
      analyzers to leave a 'gap' for more accurate phrase queries.
    -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
  <!-- Query-time chain: same steps as the index chain, except that synonyms
       from synonyms.txt are expanded here and the word-delimiter filter does
       not catenate parts (catenateWords and catenateNumbers are 0). -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
</fieldType>

【问题讨论】：

你有 schema.xml 吗？如果有，请发布。
确认您使用相同的分析器进行索引和查询
@naresh 我如何知道用于索引的分析器是什么？
@DiegoBasch 我已更新帖子以在我的架构中包含 6 个字段。 schema.xml 超过 400 行。有什么特别想看的部分吗？没有引用与我的字段相关的任何分析器，这是我困惑的一部分。
能不能也贴一下type“text”对应的fieldType xml？

标签： solr lucene

【解决方案1】：

您需要使用索引时所用的分词器（tokenizer）和过滤器构建一个自定义分析器（如 fieldType xml 的 index 部分中所定义）。将该自定义分析器作为参数传递给搜索器，然后搜索应该可以正常工作。SnowballPorterFilter 是否对 “science” 做了词干提取（stemming）？有可能……

有关构建自定义分析器的详细信息，请参阅http://whiteboardjunkie.wordpress.com/tag/custom-analyzer/。您只需要在 tokenstream() 中一个接一个地调用过滤器

此外，您可以使用 luke (http://code.google.com/p/luke/) 检查索引，看看是否有任何文档的 title 字段中包含 “science”。

【讨论】：

感谢您对分析器链的讲解。我认为我的解决方案更简单，尽管它最终可能需要更多的分析器。

【解决方案2】：

一位同事稍微修改了我的代码，使其变成下面这样。他还建议我用词干（stem）形式的词进行搜索。这种方法有效，现在我对用 solr 构建的 Lucene 索引进行搜索已经可以得到结果了。此代码仍需改进，但我将其作为概念验证发布，希望对其他人有用。

import java.io.File;

import java.util.List;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Proof-of-concept command-line tool that searches the "title" field of an
 * existing Lucene index and dumps every stored field of each hit.
 *
 * <p>NOTE(review): the query is analyzed with {@link SimpleAnalyzer}, which is
 * presumably not the chain used at index time; the accompanying discussion
 * suggests passing already-stemmed terms as the query for that reason.
 */
public class SimpleSearcher {

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the query string in Lucene query-parser syntax
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void main(String[] args) throws Exception {

        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 1) {
            System.err.println("Usage: SimpleSearcher <query>");
            System.exit(1);
        }

        File indexDir = new File("/path/to/solr/data/index/");
        int hits = 100;

        SimpleSearcher searcher = new SimpleSearcher();
        searcher.searchIndex(indexDir, args[0], hits);
    }

    /**
     * Parses {@code queryStr} against the "title" field, searches the index in
     * {@code indexDir}, and prints the name and value of every stored field of
     * each hit, followed by the number of hits returned.
     *
     * @param indexDir directory containing the Lucene index
     * @param queryStr query in Lucene query-parser syntax
     * @param maxHits  maximum number of hits to retrieve
     * @throws Exception if opening the index, parsing, or searching fails
     */
    private void searchIndex(File indexDir, String queryStr, int maxHits)
            throws Exception {

        Directory directory = FSDirectory.open(indexDir);
        try {
            IndexSearcher searcher = new IndexSearcher(directory);
            try {
                QueryParser parser = new QueryParser(Version.LUCENE_35,
                 "title", new SimpleAnalyzer());
                Query query = parser.parse(queryStr);

                TopDocs topDocs = searcher.search(query, maxHits);

                ScoreDoc[] hits = topDocs.scoreDocs;
                for (int i = 0; i < hits.length; i++) {
                    Document d = searcher.doc(hits[i].doc);
                    List<Fieldable> fields = d.getFields();

                    System.out.println( (i+1) + ". ==========================================================");
                    for ( Fieldable field : fields ) {
                       if (field.isStored()) {
                         // Print this field instance's own value: Document.get(name)
                         // only ever returns the first value of a multi-valued field.
                         System.out.println(" >> " + field.name() + " - " + field.stringValue());
                       }
                    }
                }

                System.out.println("Found " + hits.length);
            } finally {
                // Release the reader opened by the searcher even if the search fails.
                searcher.close();
            }
        } finally {
            // Release the file handles held by the directory.
            directory.close();
        }
    }
}

【讨论】：