# coding: utf-8
"""Crawler for the CICPA (Chinese Institute of CPAs) public member registry.

Pages through the person index at cmispub.cicpa.org.cn, follows each
detail link, and writes the parsed rows to CSV files plus the raw HTML
to disk under D:/crawl_data/cicpa/.
"""
import datetime
import os
import re
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup


def get_html(url, values):
    """POST *values* (form-encoded) to *url* and return ``(html, status_code)``.

    Retries up to twice on network-level failures (URLError).  On an HTTP
    error the server's status code is returned with an empty html string;
    on a connection failure a sentinel status of -1 is returned.
    The response body is decoded as GBK (the site's encoding).
    """
    html = ''
    status_code = 200
    user_agent = ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36')
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values).encode(encoding='UTF8')
    for attempt in range(1, 3):
        req = urllib.request.Request(url=url, headers=headers, data=data)
        try:
            response = urllib.request.urlopen(req)
        except HTTPError as e:
            print(url, values)
            print("The server couldn't fulfill the request.")
            print('HTTP Error,code: ', e.code)
            status_code = int(e.code)
            break  # a definitive server answer: do not retry
        except URLError as e:
            # BUG FIX: URLError has no .code attribute -- the original
            # `int(e.code)` raised AttributeError on any connection
            # failure and aborted the whole crawl.  Use a sentinel.
            status_code = -1
            print('We failed to reach a server.Reason: ', e.reason)
            print('url: %s, status code:%d, retry count:%d'
                  % (url + '?' + bytes.decode(data), status_code, attempt))
        else:
            html = response.read().decode('gbk')
            status_code = 200  # reset after a failed earlier attempt
            break
    return html, status_code


def request_page(page):
    """Fetch one page of the person index; return ``(html, status_code)``."""
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/PersonIndexAction.do'
    values = {
        'method': 'indexQuery',
        'queryType': '2',
        'isStock': '00',
        'pageSize': '',
        'pageNum': page,
        'offName': '',
        'ascGuid': '',
        'perCode': 0,
        'perName': ''
    }
    return get_html(url, values)


def parse_cicpa_page(html):
    """Return the anchor elements of the result table on an index page."""
    soup = BeautifulSoup(html, 'html.parser')
    return soup.select("#tabDetail a")


def request_detail(code):
    """Fetch the detail page for one member *code*; return ``(html, status)``."""
    print('request code:', code)
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/07/' + code + '.shtml'
    return get_html(url, {})


def parse_detail_header(html):
    """Return the detail table's column labels as one comma-joined line."""
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.select("#detailtb td.tdl")
    return ','.join(cell.get_text().strip() for cell in cells)


def parse_detail_content(html):
    """Return the detail table's data cells as one comma-joined line."""
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.select("#detailtb td.data_tb_content")
    return ','.join(cell.get_text().strip() for cell in cells)


def create_file(filepath, header):
    """Create *filepath* (and parent dirs) if missing, writing *header* first.

    An existing file is left untouched.
    """
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if not os.path.exists(filepath):
        with open(filepath, 'w') as f:
            if len(header) > 0:
                f.write(header + '\n')


def is_down_exists(code):
    """Placeholder for a 'already downloaded?' check; currently always False."""
    return False


def main():
    """Crawl all index pages, saving data rows, raw HTML and error logs."""
    start_time = datetime.datetime.now()
    html_dir = 'D:/crawl_data/cicpa/html/'
    if not os.path.isdir(html_dir):
        os.makedirs(html_dir)
    header_file = 'D:/crawl_data/cicpa/header.csv'
    need_header = not os.path.exists(header_file)
    stamp = start_time.strftime("%Y%m%d_%H%M%S_%f")
    datafile = 'D:/crawl_data/cicpa/data_%s.csv' % stamp
    page_error_file = 'D:/crawl_data/cicpa/error_page_%s.txt' % stamp
    detail_error_file = 'D:/crawl_data/cicpa/error_detail_%s.txt' % stamp
    create_file(datafile, '')
    create_file(page_error_file, 'page,status')
    create_file(detail_error_file, 'code,status')
    # Context managers guarantee the output files are closed even if the
    # crawl raises part-way through (the original leaked them on error).
    with open(datafile, 'w') as data_file_object, \
            open(page_error_file, 'w') as page_error_file_object, \
            open(detail_error_file, 'w') as detail_error_file_object:
        for i in range(1, 6912):
            print('request:', i)
            result, status = request_page(i)
            if status != 200:
                page_error_file_object.write(str(i) + ',' + str(status) + '\n')
                page_error_file_object.flush()
                continue
            items = parse_cicpa_page(result)
            for item in items:
                code = re.findall(r"javascript:viewDetail\('(\w+?)',",
                                  str(item))[0]
                html_file_path = html_dir + code + '.html'
                if os.path.exists(html_file_path):
                    continue  # already downloaded on a previous run
                detail_html, status = request_detail(code)
                if len(detail_html) == 0:
                    detail_error_file_object.write(code + ',%d\n' % status)
                    detail_error_file_object.flush()
                    continue
                if need_header:
                    # Column labels are identical on every detail page, so
                    # capture them once from the first successful fetch.
                    header = parse_detail_header(detail_html)
                    with open(header_file, 'w') as f:
                        f.write(header + '\n')
                    need_header = False
                # save base data
                line = parse_detail_content(detail_html)
                data_file_object.write(line + '\n')
                data_file_object.flush()
                # save html
                with open(html_file_path, 'w') as html_file_object:
                    html_file_object.write(detail_html + '\n')
                print(line)
    # BUG FIX: the original printed timedelta.microseconds (a component
    # that wraps at one second) while labelling it 'ms'.
    elapsed = datetime.datetime.now() - start_time
    print('finished in', elapsed.total_seconds() * 1000.0, 'ms')


if __name__ == '__main__':
    main()