【问题标题】:How to set Index Term length in Lucene.Net（如何在 Lucene.Net 中设置索引词项长度）
【发布时间】:2012-02-10 03:17:52
【问题描述】:

如何限制 Lucene.Net 仅索引长度大于 x 的词项？我按如下方式索引文档：

        // Locations of the index to write and of the data to be indexed.
        string indexDirectory = @"C:\Users\user\Desktop\Index";
        string dataDirectory = @"C:\Users\user\Desktop\Data";

        // Index writer over the index directory, using the stock StandardAnalyzer.
        var analyzer = new StandardAnalyzer();
        var writer = new IndexWriter(indexDirectory, analyzer);

        var doc = new Document();

        // "path": tokenized, not stored, no term vector.
        var fPath = new Field("path", dataDirectory, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO);
        // "content": tokenized, not stored, with a term vector so the indexed
        // terms can be read back per document.
        var fContent = new Field("content", ReadTextFile(dataDirectory), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES);

        doc.Add(fPath);
        doc.Add(fContent);

我正在使用以下代码从 Lucene 索引文件中获取已索引的词项。

        // Keep a handle on the reader so it can be closed; the original opened
        // it inline and never released it.
        IndexReader indexReader = IndexReader.Open(indexDirectory);
        try
        {
            // Term vectors of document 0, one per field that stored a term vector.
            TermFreqVector[] vectors = indexReader.GetTermFreqVectors(0);

            // Per the Lucene API docs the result may be null when the document
            // has no stored term vectors, so guard before iterating.
            if (vectors != null)
            {
                foreach (TermFreqVector vector in vectors)
                {
                    String[] terms = vector.GetTerms();

                    foreach (String term in terms)
                    {
                        // loop through indexed terms
                    }
                }
            }
        }
        finally
        {
            indexReader.Close();
        }

【问题讨论】:

    标签: lucene.net lucene


    【解决方案1】:

    您可以实现自己的分析器,或扩展 StandardAnalyzer。

    例子:

    TokenFilter + 分析器

    /// <summary>
    /// Token filter that only lets through tokens whose term text has at least
    /// a given number of characters; shorter tokens are skipped.
    /// </summary>
    public class MinTermLengthTokenFilter : TokenFilter
    {
        private readonly int _shortestAllowed;
        private readonly TermAttribute _term;

        /// <summary>
        /// Creates the filter on top of <paramref name="input"/>.
        /// </summary>
        /// <param name="maxTermLength">
        /// Despite its name, this value is used as the MINIMUM term length:
        /// tokens with fewer characters are dropped.
        /// </param>
        /// <param name="input">The token stream to filter.</param>
        public MinTermLengthTokenFilter(int maxTermLength, TokenStream input)
            : base(input)
        {
            _shortestAllowed = maxTermLength;
            _term = (TermAttribute)AddAttribute(typeof(TermAttribute));
        }

        /// <summary>
        /// Advances the wrapped stream until a token of sufficient length is found.
        /// </summary>
        /// <returns>
        /// <c>true</c> when positioned on a token whose length is at least the
        /// configured minimum; <c>false</c> once the wrapped stream is exhausted.
        /// </returns>
        public override bool IncrementToken()
        {
            for (;;)
            {
                if (!input.IncrementToken())
                {
                    return false;
                }

                if (_term.TermLength() >= _shortestAllowed)
                {
                    return true;
                }
            }
        }
    }
    
    
    /// <summary>
    /// A StandardAnalyzer whose token streams are additionally passed through
    /// <see cref="MinTermLengthTokenFilter"/> with a fixed minimum term length.
    /// </summary>
    public class MinTermLengthAnalyzer : StandardAnalyzer
    {
        private readonly int _minLength;

        /// <param name="minTermLength">Length handed to every MinTermLengthTokenFilter this analyzer creates.</param>
        public MinTermLengthAnalyzer(int minTermLength)
            : base()
        {
            _minLength = minTermLength;
        }

        /// <summary>Standard token stream for the field, wrapped in the length filter.</summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream unfiltered = base.TokenStream(fieldName, reader);
            return WithLengthFilter(unfiltered);
        }

        /// <summary>Base class's reusable token stream for the field, wrapped in a new length filter on each call.</summary>
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            TokenStream unfiltered = base.ReusableTokenStream(fieldName, reader);
            return WithLengthFilter(unfiltered);
        }

        // Wraps a stream in a fresh MinTermLengthTokenFilter using the configured length.
        private TokenStream WithLengthFilter(TokenStream stream)
        {
            return new MinTermLengthTokenFilter(_minLength, stream);
        }
    }
    

    索引:

     FSDirectory dir = FSDirectory.GetDirectory("C:\\temp\\CFSTEST");
     // Analyzer configured with length 5: tokens shorter than 5 characters
     // ("some", "text", "for" in the sample below) are not indexed.
     IndexWriter writer = new IndexWriter(dir, new MinTermLengthAnalyzer(5));
     try
     {
         Document document = new Document();

         // Term vectors are stored so the indexed terms can be listed per document later.
         document.Add(new Field(
             "text",
             "some sample text for demonstration",
             Field.Store.YES,
             Field.Index.ANALYZED,
             Field.TermVector.WITH_POSITIONS_OFFSETS));
         writer.AddDocument(document);
     }
     finally
     {
         // Close even when indexing throws, so the writer is always released.
         writer.Close();
     }
    

    搜索:

            // Hold the reader in a local: a searcher built from an existing reader
            // does not close that reader when the searcher itself is closed.
            IndexReader indexReader = IndexReader.Open("C:\\temp\\CFSTEST");
            var indexSearcher = new IndexSearcher(indexReader);
            try
            {
                var results = indexSearcher.Search(new TermQuery(new Term("text", "demonstration")), null, 25);

                foreach (var result in results.ScoreDocs)
                {
                    TermFreqVector[] vectors = indexReader.GetTermFreqVectors(result.doc);

                    // May be null when the document has no stored term vectors.
                    if (vectors == null)
                    {
                        continue;
                    }

                    foreach (TermFreqVector vector in vectors)
                    {
                        String[] terms = vector.GetTerms();

                        foreach (String term in terms)
                        {
                            Console.WriteLine(term);
                        }
                    }
                }
            }
            finally
            {
                indexSearcher.Close();
                indexReader.Close();
            }
            // outputs:
            // demonstration
            // sample
    

    【讨论】:

      猜你喜欢
      • 2013-03-18
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多