from bs4 import BeautifulSoup
import os
# Directory holding the saved Baidu Map first-page HTML files. File names are
# appended to this string directly, so the trailing separator is required.
filepath = 'D:\\pymine\\clean\\spider_map\\baidu_map_html_firstpage_pc_test\\'
pathDir = os.listdir(filepath)
# Maps each request key (derived from the file name) to {'poi_list': [...]}.
# poi_list holds either {'name': ..., 'addr': ...} dicts or the single
# placeholder string bd_no_this_name_str.
name_poi_dic = {}
# NOTE(review): the next two lists are not referenced by the loop below.
need_todo_request = ['搜索结果']
no_list = ['全国范围内未找到相关地点', '共找到0个搜索结果']
# Placeholder stored in poi_list when the page reports that nothing matched.
bd_no_this_name_str = '百度对此条无结果'
# Placeholder address used when a name has no matching address element.
no_addr_str = '百度此处无地址'
# Sample banner text: "No result found; showing search results for '大兴店'".
for allDir in pathDir:
    # Plain concatenation: filepath already ends with a path separator.
    child = filepath + allDir
    if '&' in child or '170' in child:
        os.remove(child)
        # The file no longer exists; opening it below would raise
        # FileNotFoundError, so move on to the next entry.
        continue
    requested_file = child.split('baidu_map_html_firstpage_pc')[1].split('&')[0].split('.html')[0].replace('\\', '')
    poi_list = []
    name_poi_dic[requested_file] = {'poi_list': poi_list}
    # BeautifulSoup reads the whole file inside its constructor, so the handle
    # can be released right away (it must be closed before os.remove on
    # Windows, and was previously leaked on every branch but the last).
    with open(child, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, "lxml")
    soup_text = soup.text

    if '全国范围内未找到' in soup_text:
        poi_list.append(bd_no_this_name_str)
    elif '商户免费标注' in soup_text:
        name_l = soup.find_all(class_='n-blue')
        addr_l = soup.find_all(class_='n-grey')
        len_addr = len(addr_l)
        for index_, name_tag in enumerate(name_l):
            # Names without a positional address get the placeholder.
            addr = addr_l[index_].text if index_ < len_addr else no_addr_str
            poi_list.append({'name': name_tag.text, 'addr': addr})
    elif 'm.hao123.com' in soup_text:
        name_l = soup.find_all(class_='text-ellipsis -ft-primary -ft-large')
        addr_l = soup.find_all(class_='dis-inf text-ellipsis -col-auto')
        len_addr = len(addr_l)
        for index_, name_tag in enumerate(name_l):
            raw_name = name_tag.text
            # Drop everything up to the first '.' (the code previously took
            # split('.')[1]). partition keeps dots inside the name itself and
            # does not fail when there is no '.' at all.
            _, dot, rest = raw_name.partition('.')
            name = rest if dot else raw_name
            addr = addr_l[index_].text if index_ < len_addr else no_addr_str
            poi_list.append({'name': name, 'addr': addr})
    elif '地址:' in soup_text:
        # find_all() returns a ResultSet, which has no .text attribute, so the
        # old expression always raised AttributeError. Use the first <td> that
        # carries the address marker, falling back to the page text (which is
        # known to contain it from the branch condition).
        td_text = next(
            (td.text for td in soup.find_all('td') if '地址:' in td.text),
            soup_text,
        )
        # partition splits on the first marker only, so a repeated marker
        # cannot break the name/address unpacking.
        name, _, addr = td_text.partition('地址:')
        poi_list.append({'name': name, 'addr': addr})
    else:
        # Unrecognised page layout: discard the saved file and report the key.
        # The (empty) entry stays in name_poi_dic.
        os.remove(child)
        print('TODO', requested_file)

  

# 分类 (category):

# 技术点 (technical points):

# 相关文章 (related articles):