from urllib import request
from time import sleep
from lxml import etree
import csv
# import random  # sleep(random.random() * 2) would pause for a random 0-2 seconds
# 参数部分
# sz_url = \'https://sz.lianjia.com/zufang/\'
#
# header = {
#
# \'Referer\': \'https://sz.lianjia.com/zufang/\',
# \'User-Agent\': \'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36\',
# }
# # 请求部分
# res = request.Request(sz_url,headers=header)
#
# response = request.urlopen(res)
# result = response.read().decode()
# # print(result)
# # 筛选部分
# html = etree.HTML(result)
# name_list = html.xpath(\'//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a\')
# with open(\'house.csv\',"wb") as f:
# for name in name_list:
# title=name.attrib["title"]
# f.write(title.encode())
# f.write(\'\n\'.encode())
# print(title)
# --------------------------------------------------------------------------------------------------------------
# # 参数部分
# sz_url = \'https://sz.lianjia.com/zufang/105101400296.html\'
#
# header = {
#
# \'User-Agent\': \'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36\',
# }
# # 请求部分
# res = request.Request(sz_url,headers=header)
#
# response = request.urlopen(res)
# result = response.read().decode()
# # print(result)
#
# html = etree.HTML(result)
# name_list = html.xpath(\'//div[@class="brokerName"]/a\')
#
# for name in name_list:
# text = name.text
# print(text)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@code tree@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# 请求测试
def getRequet(url, xpath, **headers):
    """Download *url* and return the nodes of its HTML that match *xpath*.

    Args:
        url: Address of the page to fetch.
        xpath: XPath expression evaluated against the parsed page.
        **headers: Extra HTTP request headers. They are layered on top of
            the built-in defaults, so a caller-supplied value wins when
            both define the same header name.

    Returns:
        The result of evaluating *xpath* on the parsed document.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    default_headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    }
    # Defaults first, caller headers last: the later mapping wins on a key
    # clash, which lets callers override e.g. the User-Agent.
    merged_headers = {**default_headers, **headers}
    req = request.Request(url, headers=merged_headers)
    # The context manager closes the connection even if read/decode fails.
    with request.urlopen(req) as response:
        result = response.read().decode()
    html = etree.HTML(result)
    return html.xpath(xpath)
def main():
    """Scrape the rental listing page and write the results to house.csv.

    For every listing link found on the index page, the listing title and
    the text of the first broker-name link on its detail page are written
    to ``house.csv`` as two consecutive UTF-8 lines. A listing whose detail
    page has no broker name gets an empty second line instead of aborting
    the whole run.
    """
    # Local import so this block does not rely on a file-level import.
    from urllib.parse import urljoin

    zf_url = 'https://sz.lianjia.com/zufang/'  # listing index page
    zf_xpath = '//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a'  # listing links
    name_xpath = '//div[@class="brokerName"]/a'  # contact name on a detail page

    # Text mode with UTF-8 and newline='\n' yields the same bytes as
    # encoding each piece by hand, without per-write encode() calls.
    # The with-statement closes the file; no explicit close() is needed.
    with open('house.csv', 'w', encoding='utf-8', newline='\n') as f:
        house_list = getRequet(zf_url, zf_xpath)
        for house in house_list:
            attrib = house.attrib
            house_name = attrib['title']
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the index page.
            url = urljoin(zf_url, attrib['href'])
            print('正在下载:', url)  # report the page actually being fetched

            contacts = getRequet(url, name_xpath)
            # Tolerate a missing broker element or an element without text.
            username = (contacts[0].text or '') if contacts else ''

            f.write(house_name)
            f.write('\n')
            f.write(username)
            f.write('\n')
            sleep(1)  # pause between detail-page requests
        print('下载完成')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()