1、建立索引

#coding=utf-8
from __future__ import unicode_literals
__author__ = 'zh'
import sys,os
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from jieba.analyse import ChineseAnalyzer
import pymongo
import json
from pymongo.collection import Collection
from pymongo import database

class CreatIndex:
    """Build a Whoosh full-text index from page documents stored in MongoDB.

    Rows of the ``pages`` collection that carry no ``indexed`` field are added
    to the on-disk index and afterwards flagged with ``indexed: True``.
    """

    # Directory holding the Whoosh index, relative to the working directory.
    INDEX_DIR = "../whoosh_index"
    # How many page ids are flagged per MongoDB update after the commit.
    MARK_BATCH_SIZE = 1000

    def __init__(self):
        """Connect to MongoDB and bind the ``pages`` collection of ``webdb``."""
        self.mongoClient = pymongo.MongoClient('192.168.229.128', 27017)
        self.websdb = pymongo.database.Database(self.mongoClient, 'webdb')
        self.pagesCollection = Collection(self.websdb, 'pages')

    def BuiltIndex(self):
        """Index every not-yet-indexed page, then flag those pages in MongoDB.

        Rows are flagged only after the Whoosh writer has committed, so a
        failure part-way through never leaves a row marked as indexed while
        missing from the index. Processing stops at the first row that cannot
        be indexed; rows added before it are still committed and flagged.
        """
        ix = self._open_index()
        writer = ix.writer()
        indexed_ids = []
        try:
            pending = self.pagesCollection.find({'indexed': {'$exists': False}})
            for row in pending:
                try:
                    writer.add_document(**self._row_to_document(row))
                except Exception as exc:
                    # %r keeps the message printable whatever the exception
                    # text is encoded as.
                    print('%s 异常: %r' % (row.get('_id'), exc))
                    break
                indexed_ids.append(row['_id'])
                print('%s 已建立索引!' % (row['_id'],))
            else:
                print('全部处理完毕')
        except Exception as exc:
            # Failure while reading from MongoDB: keep what was added so far.
            print('异常: %r' % (exc,))
        except BaseException:
            # Hard abort (e.g. Ctrl-C): discard pending changes and release
            # the index write lock instead of leaving it held.
            writer.cancel()
            raise

        writer.commit()

        # Flag rows only now that the index is safely on disk.
        for start in range(0, len(indexed_ids), self.MARK_BATCH_SIZE):
            batch = indexed_ids[start:start + self.MARK_BATCH_SIZE]
            self.pagesCollection.update_many(
                {'_id': {'$in': batch}},
                {'$set': {'indexed': True}},
            )

        total = self.pagesCollection.find().count()
        print('已处理 %d 共计 %d' % (len(indexed_ids), total))

    def _open_index(self):
        """Open the index in ``INDEX_DIR``, creating directory and index if absent."""
        from whoosh.filedb.filestore import FileStorage

        analyzer = ChineseAnalyzer()
        # Index template.
        schema = Schema(
            U_id=ID(stored=True),
            title=TEXT(stored=True, analyzer=analyzer),
            location=TEXT(stored=True),
            publish_time=DATETIME(stored=True, sortable=True),
            content=TEXT(stored=False, analyzer=analyzer),
        )
        storage = FileStorage(self.INDEX_DIR)
        if os.path.exists(self.INDEX_DIR):
            return storage.open_index()
        os.mkdir(self.INDEX_DIR)
        ix = storage.create_index(schema)
        print('建立索引文件!')
        return ix

    @staticmethod
    def _row_to_document(row):
        """Translate one MongoDB page row into ``add_document`` keyword arguments.

        Raises ``KeyError`` when ``_id``, ``name`` or ``information`` is missing.
        """
        publish_time = row.get('publish_time')
        # An empty or zero publish time is treated as "no publish time".
        if '%s' % (publish_time,) in ('', '0'):
            publish_time = None

        location = ''
        if 'location' in row:
            location = json.dumps(row['location'])

        # The '%s' % ... formatting yields text strings under Python 2 as well,
        # because the file enables unicode_literals.
        return {
            'U_id': '%s' % (row['_id'],),
            'title': row['name'],
            'location': '%s' % (location,),
            'publish_time': publish_time,
            'content': row['information'],
        }

# Script entry: runs on import/execution of this file. Instantiating CreatIndex
# connects to MongoDB; BuiltIndex then indexes the pages not yet flagged.
creatindext = CreatIndex()
creatindext.BuiltIndex()
View Code

相关文章:

  • 2022-12-23
  • 2022-01-04
  • 2022-12-23
  • 2022-03-01
  • 2022-01-17
  • 2021-12-13
  • 2021-11-07
猜你喜欢
  • 2022-12-23
  • 2021-11-19
  • 2022-02-27
  • 2022-12-23
  • 2021-08-28
  • 2021-04-04
  • 2022-12-23
相关资源
相似解决方案