1、建立索引
#coding=utf-8 from __future__ import unicode_literals __author__ = 'zh' import sys,os from whoosh.index import create_in,open_dir from whoosh.fields import * from jieba.analyse import ChineseAnalyzer import pymongo import json from pymongo.collection import Collection from pymongo import database class CreatIndex: def __init__(self): self.mongoClient = pymongo.MongoClient('192.168.229.128',27017) self.websdb = pymongo.database.Database(self.mongoClient,'webdb') self.pagesCollection = Collection(self.websdb,'pages') def BuiltIndex(self): analyzer = ChineseAnalyzer() # 索引模版 schema = Schema( U_id=ID(stored=True), # md5=ID(stored=True), title=TEXT(stored=True,analyzer=analyzer), location=TEXT(stored=True), publish_time=DATETIME(stored=True,sortable=True), content=TEXT(stored=False,analyzer=analyzer) ) from whoosh.filedb.filestore import FileStorage storage = FileStorage("../whoosh_index") if not os.path.exists("../whoosh_index"): os.mkdir("../whoosh_index") ix = storage.create_index(schema) print '建立索引文件!' else: ix=storage.open_index() # if not os.path.exists("whoosh_index"): # os.mkdir("whoosh_index") # ix = create_in("whoosh_index", schema) # for create new index # #ix = open_dir("tmp") # for read only writer = ix.writer() try: num=0 while(True): # break try: row=self.pagesCollection.find_one({'indexed':{'$exists':False}}) if row!=None: publish_time=None if row.has_key('publish_time'): publish_time=row['publish_time'] if str(publish_time)=='' or str(publish_time)=='0': publish_time=None location='' if row.has_key('location'): location=json.JSONEncoder().encode(row['location']) writer.add_document( U_id=''.join(str(row['_id'])), # md5=row['md5'], title=row['name'], location=''.join(location), publish_time=publish_time, content=row['information'] ) self.pagesCollection.update_one({"_id":row["_id"]},{"$set":{"indexed":True}}) num+=1 print row["_id"],"已建立索引!" else: writer.commit() print "全部处理完毕" # time.sleep(3600) # self.BuiltIndex() break except: print row["_id"],"异常" break except: writer.commit() print "异常" # print '已处理',num,'共计', self.pagesCollection.find({'indexed':{'$exists':True}}).count() print '已处理',num,'共计', self.pagesCollection.find().count() creatindext = CreatIndex() creatindext.BuiltIndex()