使用 elasticsearch-dsl-py 索引和 percolate（渗滤）文档 - 答案

【问题标题】：Indexing and percolating documents with elasticsearch-dsl-py使用 elasticsearch-dsl-py 索引和过滤文档
【发布时间】：2018-08-29 10:53:25
【问题描述】：

我正在为一个信息检索研讨课做调研。我有一个包含文章列表的 json 文件，需要先为它们建立索引，然后使用带高亮显示（highlight）的 percolator。

在终端中执行此操作的步骤列表如下：
1. 创建包含 percolator 字段的映射（mapping）。

curl -XPUT 'localhost:9200/my-index?pretty' -H 'Content-Type: application/json' -d'
{
    "mappings": {
        "_doc": {
            "properties": {
                "title": {
                    "type": "text"
                },
                "query": {
                    "type": "percolator"
                }
            }
        }
    }
}
'

索引一篇新文章：

curl -XPUT 'localhost:9200/my-index/_doc/1?refresh&pretty' -H 'Content-Type: application/json' -d'
{           
    "CourseId":35,
      "UnitId":12390,
      "id":"16069",
      "CourseName":"ARK102U_ARKEOLOJİK ALAN YÖNETİMİ",
      "FieldId":8,
      "field":"TARİH",
    "query": {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "title" : "dünya" } },
                { "span_term" : { "title" : "mirası" } },
                { "span_term" : { "title" : "sözleşmesi" } }
            ],
            "slop" : 0,
            "in_order" : true
        }
    }
}
'

对文档执行 percolate 查询：

curl -XGET 'localhost:9200/my-index/_search?pretty' -H 'Content-Type: application/json' -d'
{
    "query" : {
        "percolate" : {
            "field" : "query",
            "document" : {
                "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
            }
        }
    },

    "highlight": {
      "fields": {
        "title": {}
      }
    }
}
'

到现在为止我都有这个代码：

import json
from elasticsearch_dsl import (
DocType,
Integer,
Percolator,
Text,
)

# Read the JSON file. The context manager closes the file handle, and the
# explicit encoding keeps the Turkish text intact regardless of the
# platform's default encoding.
with open('titles.json', encoding='utf-8') as titles_file:
    json_data = titles_file.read()
data = json.loads(json_data)

# The articles are nested under response -> docs in the file.
docs = data['response']['docs']

# Creating a elasticsearch connection
# connections.create_connection(hosts=['localhost'], port=['9200'], timeout=20)
"""
curl -XPUT 'localhost:9200/my-index?pretty' -H 'Content-Type: application/json' -d'
{
    "mappings": {
        "_doc": {
            "properties": {
                "title": {
                    "type": "text"
                },
                "query": {
                    "type": "percolator"
                }
            }
        }
    }
}
'

"""

class Documment(DocType):
    """Document type holding course metadata, a title and a stored query.

    ``title`` and ``query`` are declared as regular fields so that they end
    up in the generated mapping; a free-standing ``properties={...}`` dict
    after ``Meta`` is not valid Python and is not how DocType declares
    fields.
    """
    course_id = Integer()
    unit_id = Integer()
    id = Integer()
    course_name = Text()
    field_id = Integer()
    field = Text()

    # Same two properties as the curl mapping: a text field and a
    # percolator field.
    title = Text()
    query = Percolator()

    class Meta:
        index = 'titles_index'

"""
    "query": {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "title" : "dünya" } },
                { "span_term" : { "title" : "mirası" } },
                { "span_term" : { "title" : "sözleşmesi" } }
            ],
            "slop" : 0,
            "in_order" : true
        }
    }

"""

for doc in docs:
    # Read every value from the current article (`doc`); the original
    # indexed the `docs` collection itself. The split separator uses plain
    # ASCII quotes, because typographic quotes are a syntax error.
    terms = doc['title'].split(" ")
    course_id = doc['CourseId']
    unit_id = doc['UnitId']
    id = doc['id']
    course_name = doc['CourseName']
    field_id = doc['FieldId']
    field = doc['field']

更新： 谢谢你的回答，我现在有这个：

import json

from elasticsearch_dsl import (
    connections,
    DocType,
    Mapping,
    Percolator,
    Text
)
from elasticsearch_dsl.query import (
    SpanNear,
    SpanTerm
)
from elasticsearch import Elasticsearch

# Read the JSON file. The context manager closes the file handle, and the
# explicit encoding keeps the Turkish text intact regardless of the
# platform's default encoding.
with open('titles.json', encoding='utf-8') as titles_file:
    json_data = titles_file.read()
data = json.loads(json_data)

# The articles are nested under response -> docs in the file.
docs = data['response']['docs']


# creating a new default elasticsearch connection
connections.configure(
    default={'hosts': 'localhost:9200'},
)


class Document(DocType):
    """Document with a text ``title`` and a percolator ``query`` field."""

    title = Text()
    query = Percolator()

    class Meta:
        # Target index and mapping type for this document class.
        doc_type = '_doc'
        index = 'title-index'

    def save(self, **kwargs):
        """Save the document by delegating to the parent implementation."""
        result = super().save(**kwargs)
        return result


# create the mappings in elasticsearch
Document.init()

# Build one span_near query per article title and store it together with
# the title.
for article in docs:
    title_text = article['title']
    # One span_term clause per space-separated token of the title.
    span_terms = [SpanTerm(title=word) for word in title_text.split(" ")]
    percolator_query = SpanNear(clauses=span_terms)
    Document(title=title_text, query=percolator_query).save()

一切正常，但我现在遇到两个问题：

在为 dict 中若干条（数量不固定）项目建立索引之后，出现了下面的错误：

elasticsearch.exceptions.AuthorizationException: TransportError(403, 
'cluster_block_exception', 'blocked by: [FORBIDDEN/12/index read-only 
/ allow delete (api)];')

我知道我可以使用这个命令来解决这个问题：curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'

更新：最后我通过删除数据文件夹解决了这个问题。

但现在我在索引中进行搜索，但我什么也没得到：

>>> text='Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir.'
>>> s = Search().using(client).query("percolate", field='query', document={'title': text}).highlight('title')
>>> print(s.to_dict())
{'query': {'percolate': {'field': 'query', 'document': {'title': 'Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> "kazıbilim" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir.'}}}, 'highlight': {'fields': {'title': {}}}}
>>> response = s.execute()
>>> response
<Response: {}>

这是我对curl的尝试：

 curl -XGET 'localhost:9200/title-index/_search?pretty' -H 'Content-Type: application/json' -d '{  
    "query" : {        
        "percolate" : {       
            "field" : "query",
            "document" : {
                "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
            }
        }
    },            
    "highlight": {
      "fields": {  
        "title": {}
      }
    }
}'
{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 0,
    "max_score" : null,
    "hits" : [ ]
  }
}

我得到的统计信息（例如 took）每次都在变化，但始终没有得到任何匹配结果：

>>> response.to_dict()
{'took': 9, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': 0, 'max_score': None, 'hits': []}}
>>> response
{'took': 12, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': 0, 'max_score': None, 'hits': []}}
>>> response
{'took': 12, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': 0, 'max_score': None, 'hits': []}}

谁能帮帮我？

【问题讨论】：

标签： python elasticsearch elasticsearch-dsl elasticsearch-dsl-py

【解决方案1】：

第一步是正确的，即映射没有问题。但是，您需要先索引一个查询，这正是 percolate 的关键所在。因此，让我们先为您的查询建立索引：

curl -XPUT 'localhost:9200/my-index/_doc/my-span-query?refresh&pretty' -H 'Content-Type: application/json' -d '{           
    "query": {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "title" : "dünya" } },
                { "span_term" : { "title" : "mirası" } },
                { "span_term" : { "title" : "sözleşmesi" } }
            ],
            "slop" : 0,
            "in_order" : true
        }
    }
}'

接下来的思路是找出哪些已存储的查询会匹配您要 percolate 的文档，所以让我们对文档执行 percolate：

curl -XGET 'localhost:9200/my-index/_search?pretty' -H 'Content-Type: application/json' -d '{
    "query" : {
        "percolate" : {
            "field" : "query",
            "document" : {
                "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
            }
        }
    },
    "highlight": {
      "fields": {
        "title": {}
      }
    }
}'

您会收到如下响应，其中的高亮（highlight）部分显示了 my-span-query 与给定文档匹配的位置：

{
  "took": 104,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.8630463,
    "hits": [
      {
        "_index": "my-index",
        "_type": "_doc",
        "_id": "my-span-query",
        "_score": 0.8630463,
        "_source": {
          "query": {
            "span_near": {
              "clauses": [
                {
                  "span_term": {
                    "title": "dünya"
                  }
                },
                {
                  "span_term": {
                    "title": "mirası"
                  }
                },
                {
                  "span_term": {
                    "title": "sözleşmesi"
                  }
                }
              ],
              "slop": 0,
              "in_order": true
            }
          }
        },
        "fields": {
          "_percolator_document_slot": [
            0
          ]
        },
        "highlight": {
          "title": [
            "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, <em>dünya</em> <em>mirası</em> <em>sözleşmesi</em> sosyoloji, coğrafya"
          ]
        }
      }
    ]
  }
}

更新

使用 elasticsearch-dsl-py 实现同样的事情：

from elasticsearch import Elasticsearch
from elasticsearch_dsl import DocType, Mapping, Percolator, Text

class Document(DocType):
    """Document with a text ``title`` and a percolator ``query`` field."""

    title = Text()
    query = Percolator()

    class Meta:
        index = 'my-index'
        # Use the same mapping type as the curl mapping and as
        # Mapping('_doc') in the alternative below; without it the class
        # produces a mapping named `doc` instead of `_doc`.
        doc_type = '_doc'

    def save(self, **kwargs):
        """Save the document by delegating to the parent implementation."""
        return super(Document, self).save(**kwargs)

# 1a. create the mappings in elasticsearch
Document.init()

# 1b. or another alternative way of saving the mapping
# `Mapping` is referenced through its imported name; the bare module name
# `elasticsearch_dsl` is never imported in this snippet.
query_mapping = Mapping('_doc')
query_mapping.field('title', 'text')
query_mapping.field('query', 'percolator')
query_mapping.save('my-index')

# 2. index the query
# Any query body can be stored in the percolator field; this is the span
# query from the curl example.
query = Document(query={
    'span_near': {
        'clauses': [
            {'span_term': {'title': 'dünya'}},
            {'span_term': {'title': 'mirası'}},
            {'span_term': {'title': 'sözleşmesi'}},
        ],
        'slop': 0,
        'in_order': True,
    }
})
# refresh=True mirrors the `?refresh` flag of the curl command so the stored
# query is searchable by the percolate request that follows.
query.save(refresh=True)

# 3. send the percolate query
client = Elasticsearch()

# Request body: percolate one document against the stored queries and ask
# for highlighting on the title field.
percolate_request = {
    "query": {
        "percolate": {
            "field": "query",
            "document": {
                "title": "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
            },
        },
    },
    "highlight": {
        "fields": {
            "title": {},
        },
    },
}
response = client.search(index="my-index", body=percolate_request)

更新 2

没有理由将title 与查询一起存储，您只需存储查询，因此您的代码应如下所示：

# index the query
for doc in docs:
    terms = doc['title'].split(" ")
    # One span_term clause per space-separated token of the title.
    clauses = [SpanTerm(title=term) for term in terms]
    query = SpanNear(clauses=clauses)
    # Changed line: store only the query; the title is no longer saved
    # alongside it.
    item = Document(query=query)
    item.save()

【讨论】：

对不起，我知道怎么做；我需要做同样的事情，但使用elastic-search-dsl-py。
嗯，第二步（即索引查询）不正确，不可能那样工作。无论如何，让我更新我的答案并在其中添加一些 python
我在每一步之前都添加了注释，过程非常简单，并且与上面的每个 curl 命令一一对应：1) 为 percolator 字段创建映射，2) 索引查询，3) 对一个示例文档执行 percolate。
如何索引我的字典中的所有文档？
我明白了。用第二种方法我得到了正确的 _doc 映射，但用第一种方法得到的却是 doc 的映射，为什么？