【问题标题】:Obtaining Lucene term vectors for a found term in a string（获取字符串中找到的术语的 Lucene 术语向量）
【发布时间】:2018-11-17 03:23:20
【问题描述】:

我正在尝试突出显示字符串中的术语。我的代码沿字符串搜索并在索引中查找等效项。代码返回找到的条款没问题。但是,我想将用户输入的原始字符串返回给用户,并突出显示找到的术语。我正在使用 Lucene 4,因为那是我用来学习 Lucene 的书。我有一个可怜的尝试来获取术语向量等,但它会遍历整个领域,我无法弄清楚如何获取找到的术语。这是我的代码:

public class TokenArrayTest {
private static final String INDEX_DIR = "C:/ontologies/Lucene/icnpIndex";
//private static  List<Float> levScore = new ArrayList<Float>();
//add key and value pairs of tokens to a map to send to a servlet. key 10,11,12 etc
    //private static HashMap<Integer, String> hashMap = new HashMap<Integer, String>();
private static List<String> tokens = new ArrayList<String>();

private static int totalResults=0;

public static void main(String[] pArgs) throws IOException, ParseException, InvalidTokenOffsetsException 
{   

    //counters which detect found term changes to advance the html table to the next cell
    int b=1;
    int c=1;
    String searchText="Mrs. smith has limited mobility and fell out of bed. She needs a feeding assessment. She complained of abdominal pains nuring the night. She woke with a headache and she is due for a shower this morning."; 

    //Get directory reference
    Directory dir = FSDirectory.open(new File(INDEX_DIR));

    //Index reader - an interface for accessing a point-in-time view of a lucene index
    IndexReader reader = DirectoryReader.open(dir);

    //Create lucene searcher. It search over a single IndexReader.
    IndexSearcher searcher = new IndexSearcher(reader);

    //analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

    TokenStream tokenStream  = analyzer.tokenStream(null, new StringReader(searchText));
    CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);

    //Query parser to be used for creating TermQuery
    QueryParser qp = new QueryParser(Version.LUCENE_40, "Preferred Term", analyzer);


   /*add all of the words to an array after they have passed through the analyzer.
    * The words are used one by one through the query method later on.
    */
    while (tokenStream.incrementToken()) { 
        tokens.add(termAttribute.toString());         
    }       

        //print the top half of the html page       
    System.out.print("<html>\r\n" + 
            "\r\n" + 
            "<head>\r\n" + 
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\">\r\n" + 
            "\r\n" + 
            "<title>ICNP results</title>\r\n" + 
            "</head>\r\n" + 
            "\r\n" + 
            "<body>\r\n" + 
            "\r\n" + 
            "<p>"+
            searchText+"<br>"+
            "<p>"+
            "<div align=\"center\">\r\n" + 
            "  <center>\r\n" + 
            "  <table border=\"1\" \r\n" + 
            "    <tr>\r\n" +
            "<td>\r\n"+

            "");


    //place each word from the previous array into the query       
    for(int n=0;n<tokens.size();++n) {

    //Create the query
    Query query = qp.parse(tokens.get(n));

    //Search the lucene documents for the hits
    TopDocs hits = searcher.search(query, 20);

  //Total found documents
    totalResults =totalResults+hits.totalHits;



    //print out the score for each searched term
    //for (ScoreDoc sd : hits.scoreDocs)
    //{
       //Document d = searcher.doc(sd.doc);

       // System.out.println("Score : " + sd.score);


   // }


    /** Highlighter Code Start ****/

    //Put a html code in here for each found term if need be
    Formatter formatter = new SimpleHTMLFormatter("", "");

    //Scores text fragments by the number of unique query terms found
    QueryScorer scorer = new QueryScorer(query);

    //used to markup highlighted terms found in the best sections of a text
    Highlighter highlighter = new Highlighter(formatter, scorer);

    //It breaks text up into same-size texts but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 20);

    //set fragmenter to highlighter
    highlighter.setTextFragmenter(fragmenter);


    //Iterate over found results
    for (int i = 0; i < hits.scoreDocs.length; i++)
    {

        int docid = hits.scoreDocs[i].doc;
        Document doc = searcher.doc(docid);

        //Get stored text from found document
        String text = doc.get("Preferred Term");


        //a pitiful attempt to get term vectors and such like
        termsVector = reader.getTermVector(i, "Preferred Term");
        termsEnum = termsVector.iterator(termsEnum);
        while ( (term = termsEnum.next()) != null ) {
            val = term.utf8ToString();
            System.out.println("DocId: " + i);
            System.out.println("  term: " + val);

            System.out.println("  length: " + term.length);
            docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
            if (docsAndPositionsEnum.nextDoc() >= 0) {
                int freq = docsAndPositionsEnum.freq();
                System.out.println("  freq: " + docsAndPositionsEnum.freq());
                for (int j = 0; j < freq; j++) {
                    System.out.println("    [");
                    System.out.println("      position: " + docsAndPositionsEnum.nextPosition());
                    System.out.println("      offset start: " + docsAndPositionsEnum.startOffset());
                    System.out.println("      offset end: " + docsAndPositionsEnum.endOffset());
                    System.out.println("    ]");
                }
            }
        }

        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "Preferred Term", analyzer);


        //Get highlighted text fragments
        String[] frags = highlighter.getBestFragments(stream, text,20);


        for (String frag : frags)
        {


            //On the first pass  print this html out         

            if((c==1)&&(b!=c)) {
                System.out.println("<select>");
                c=b;
            }else if((b!=c)) {  //and every other time move to the next cell when b changes
                System.out.println("</select>"
                        + "</td><td>"
                        + "<select>");
                c=b;

            }

            System.out.println("<option value='"+frag+"'>"+frag+"</option>");


        }

    }

    b=b+1;


}
    dir.close();
    b=1;
    c=1;
    totalResults=0;

    //print the bottom half of the html page
    System.out.print("</select></td>\r\n" + 
            "    </tr>\r\n" + 
            "  </table>\r\n" + 
            "  </center>\r\n" + 
            "</div>\r\n" + 
            "\r\n" + 
            "</body>\r\n" + 
            "\r\n" + 
            "</html>\r\n" + 
            ""); 


    }
}

【问题讨论】:

  • 所以你只是因为这本书而使用 lucene 4?这并不能解决您的初始问题,但我建议您先升级到最新版本,或者您是否有任何原因无法做到这一点?
  • 是的,Dom,现在没有真正的理由。这本书只是为了让我了解 Lucene。在较新的 Lucene 版本中是否有工具可以突出显示原始字符串中的结果项?

标签: lucene highlight term-vectors


【解决方案1】:

我不知道 lucene v4 是否可行，但对于较新的版本，用 Highlighter 或 UnifiedHighlighter 很容易实现。有几个教程以不同的方式实现文本突出显示（只需谷歌一下……）：

如果您从一个新项目开始，我强烈建议您使用最新版本，即使您的书基于 lucene v4。这本书很适合对 lucene 的工作原理有一个基本的了解，但是使用旧版本的库会立即产生技术债务，您以后还得处理它。除此之外，较新的版本通常会提供您可能感兴趣的附加功能。

【讨论】:

    【解决方案2】:

    对于未来的读者,这是我打印出偏移量的普通旧 java 方法 (POJM)。

    generatePreviewText(analyzer, searchText, tokens, frags);

    public static void generatePreviewText(Analyzer analyzer, String inputText, List<String> tokens, String[] frags) throws IOException
    
    {
      String contents[]= {inputText}; 
      String[] foundTerms = frags;
    
      //for(int n=0;n<frags.length;++n) {
          //System.out.println("Found terms array= "+foundTerms[n]);
     // }
    
    Directory directory = new RAMDirectory();
    IndexWriterConfig config =
            new IndexWriterConfig(Version.LUCENE_40, analyzer);
    IndexWriter indexWriter = new IndexWriter(directory, config);
    
    FieldType textFieldType = new FieldType();
    textFieldType.setIndexed(true);
    textFieldType.setTokenized(true);
    textFieldType.setStored(true);
    textFieldType.setStoreTermVectors(true);
    textFieldType.setStoreTermVectorPositions(true);
    textFieldType.setStoreTermVectorOffsets(true);
    
    Document doc = new Document();
    Field textField = new Field("content", "", textFieldType);
    
    
    
    for (String content : contents) {
        textField.setStringValue(content);
        doc.removeField("content");
        doc.add(textField);
        indexWriter.addDocument(doc);
    }
    
    indexWriter.commit();
    IndexReader indexReader = DirectoryReader.open(directory);
    DocsAndPositionsEnum docsAndPositionsEnum = null;
    Terms termsVector = null;
    TermsEnum termsEnum = null;
    BytesRef term = null;
    String val = null;
    
    for (int i = 0; i < indexReader.maxDoc(); i++) {
        termsVector = indexReader.getTermVector(i, "content");
        termsEnum = termsVector.iterator(termsEnum);
        while ( (term = termsEnum.next()) != null ) {
    
    
            val = term.utf8ToString();
    
           // if(foundTerms.get(i)==val) {
    
            System.out.println("  term: " + val);
    
            System.out.println("  length: " + term.length);
            docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
            if (docsAndPositionsEnum.nextDoc() >= 0) {
                int freq = docsAndPositionsEnum.freq();
                System.out.println("  freq: " + docsAndPositionsEnum.freq());
                for (int j = 0; j < freq; j++) {
                    System.out.println("    [");
                    System.out.println("      position: " + docsAndPositionsEnum.nextPosition());
                    System.out.println("      offset start: " + docsAndPositionsEnum.startOffset());
                    System.out.println("      offset end: " + docsAndPositionsEnum.endOffset());
    
                    System.out.println("    ]");
                }
            }
    

    //} }

    }indexWriter.close();
    

    }

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2023-03-27
      • 2016-06-18
      • 2012-02-14
      • 1970-01-01
      • 1970-01-01
      • 2012-02-05
      • 2022-06-11
      • 1970-01-01
      相关资源
      最近更新 更多