admin 管理员组文章数量: 887021
# Scrape list pages and detail pages from qiushibaike.com with regular
# expressions and store the results in a local MySQL database via pymysql.
#
# NOTE(review): the original text had every ".com" substring stripped by the
# publishing pipeline (`repile`, `cls.dbmit()`, bare "https://www.qiushibaike"
# URLs); those are restored below.
import re
import random
import time

import pymysql
from urllib.request import Request, urlopen


class QSBKDataTool(object):
    """Cleanup helpers for the raw tuples captured by the list-page regex.

    Example raw tuple (newlines and <br/> tags still embedded):
        ('\n猩猩眨呀眨\n', '24', '/article/120510346',
         '\n\n...<br/>...\n\n', '5785', '43')
    """

    # Pre-compiled patterns, hoisted so they are built once per process.
    remove_n = re.compile(r'\n', re.S)
    remove_br = re.compile(r'<br/>|<br>', re.S)

    @classmethod
    def process_data(cls, origin_data):
        """Strip newlines from nicknames and newlines/<br> tags from content.

        :param origin_data: iterable of 6-tuples
            (nick_name, age, href, content, smile_num, comment_num)
        :return: list of cleaned 6-tuples in the same field order.
        """
        result_data = []
        for data in origin_data:
            # Field 0: nickname — drop embedded newlines.
            nick_name = re.sub(cls.remove_n, '', data[0])
            # Field 3: article content — drop newlines and <br> variants.
            content = re.sub(cls.remove_n, '', data[3])
            content = re.sub(cls.remove_br, '', content)
            result_data.append(
                (nick_name, data[1], data[2], content, data[4], data[5]))
        return result_data

    @classmethod
    def process_next(cls, data):
        """Return (href, label) for the pagination link, newline-stripped.

        :param data: non-empty findall result; data[0] is (href, label_text).
        """
        next_page_str = re.sub(cls.remove_n, '', data[0][1])
        return (data[0][0], next_page_str)


class QSBKDBTool(object):
    """Thin class-level wrapper around one shared pymysql connection."""

    db = None
    cursor = None

    @classmethod
    def connect_db(cls):
        """Open the shared connection/cursor to the local `qsbk` database."""
        cls.db = pymysql.connect(host='localhost', user='root',
                                 passwd='123456', db='qsbk', port=3306,
                                 charset='utf8')
        cls.cursor = cls.db.cursor()

    @classmethod
    def save_list_data(cls, list_data):
        """Insert one row per list-page article; best-effort per row.

        The article id parsed from the href (e.g. /article/120510346) is the
        primary key, so duplicate pages simply roll back and continue.
        """
        insert_sql = ("INSERT INTO qsbk (`q_id`, `q_name`, `q_age`, `q_href`, "
                      "`q_content`, `q_smail_num`, q_comment_num ) "
                      "VALUES (%s, %s, %s, %s, %s, %s, %s)")
        for q_name, q_age, q_href, q_content, q_smail_num, q_comment_num in list_data:
            q_id = q_href.split('/')[2]
            try:
                cls.cursor.execute(insert_sql,
                                   (q_id, q_name, q_age, q_href, q_content,
                                    q_smail_num, q_comment_num))
                cls.db.commit()  # was `cls.dbmit()` — ".com" stripped by scraper
            except Exception:
                # Deliberately best-effort: duplicate key or emoji the utf8
                # (3-byte) charset cannot store — skip this row.
                print('主键冲突或者内容有表情数据,跳过...')
                cls.db.rollback()

    @classmethod
    def save_detail_data(cls, q_id, detail_data):
        """Insert each detail-page comment for article `q_id`; best-effort."""
        if detail_data:
            insert_sql = "INSERT INTO detail (comment, q_id) VALUES (%s,%s)"
            for comment in detail_data:
                try:
                    cls.cursor.execute(insert_sql, (comment, q_id))
                    cls.db.commit()  # was `cls.dbmit()`
                except Exception as e:
                    print('详情页主键冲突或者内容有表情数据,跳过...', e)
                    cls.db.rollback()

    @classmethod
    def connect_close(cls):
        """Close the shared cursor and connection."""
        cls.cursor.close()
        cls.db.close()


class QSBKDetailSpider(object):
    """Fetch and parse one article's detail (comments) page."""

    user_agent_list = [
        "User-Agent, Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    ]

    def __init__(self, url):
        self.url = url

    def get_page_detail(self):
        """Download the detail page; return its HTML or None on any failure.

        Two failure modes are handled separately: urlopen() itself failing,
        and decode() failing on pages containing emoji/bad bytes.
        """
        user_agent = random.choice(self.user_agent_list)
        request = Request(self.url, headers={'User-Agent': user_agent})
        try:
            response = urlopen(request)
            try:
                origin_html = response.read().decode()
            except Exception as e:
                print('decode()失败,原因:{},url:{}'.format(e, self.url))
                return None
        except Exception as e:
            print('urlopen()失败,原因:{}, url:{}'.format(e, self.url))
            return None
        else:
            return origin_html

    def parse_page_detail(self, origin_html):
        """Extract comment bodies; drops the trailing match (page footer).

        :return: list of comment strings, or None when origin_html is None.
        """
        if origin_html is not None:
            comment = re.findall(
                re.compile(r'<a.*?class="userlogin".*?<span class="body">(.*?)</span>', re.S),
                origin_html)
            # Guard: original popped unconditionally and would IndexError on
            # a page with no matches.
            if comment:
                comment.pop()
            return comment
        print('详情页源代码为空')


class QSBKSpider(object):
    """Crawl the qiushibaike 'hot' list pages and follow pagination."""

    user_agent_list = [
        "User-Agent, Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    ]

    def __init__(self):
        # ".com" restored — the published text had the bare host.
        self.base_url = 'https://www.qiushibaike.com/hot/page/'

    def get_page_list(self, page_num):
        """Download one list page.

        :param page_num: page number appended to base_url
        :return: page HTML, or None when the request or decode fails
        """
        url = self.base_url + str(page_num)
        user_agent = random.choice(self.user_agent_list)
        request = Request(url, headers={'User-Agent': user_agent})
        try:
            response = urlopen(request)
            try:
                origin_html = response.read().decode()
            except Exception as e:
                print('decode()失败,原因:{},url:{}'.format(e, url))
                return None
        except Exception as e:
            print('urlopen()失败,原因:{}, url:{}'.format(e, url))
            return None
        else:
            return origin_html

    def parser_page_list(self, origin_html):
        """Parse a list page, persist its data, then recurse to the next page.

        :param origin_html: one page's HTML source (or None)
        """
        if origin_html is not None:
            pattern = re.compile(
                r'<div class="author clearfix">.*?<h2>(.*?)</h2>'
                r'.*?<div class="articleGender.*?">(.*?)</div>'
                r'.*?<a.*?href="(.*?)".*?>'
                r'.*?<div class="content">.*?<span>(.*?)</span>'
                r'.*?<div class="stats">.*?<i class="number">(.*?)</i>'
                r'.*?<i class="number">(.*?)</i>', re.S)
            origin_data = re.findall(pattern, origin_html)
            result_data = QSBKDataTool.process_data(origin_data)
            # Persist the list rows, then fetch each article's comments.
            QSBKDBTool.save_list_data(result_data)
            self.get_detail_url(result_data)
            # Pagination: the last page-number <span> is "下一页" when a next
            # page exists.
            next_page_pattern = re.compile(
                r'.*<span class="page-numbers">.*?<a href="(.*?)".*?>'
                r'.*?<span.*?>(.*?)</span>', re.S)
            res = re.findall(next_page_pattern, origin_html)
            if not res:
                # Guard: original indexed res[0] unconditionally.
                print('未找到分页信息')
                return
            next_data = QSBKDataTool.process_next(res)
            if next_data[1] == '下一页':
                relation_url = next_data[0]
                number = re.search(r'(\d+)', relation_url).group()
                html = self.get_page_list(number)
                self.parser_page_list(html)
                time.sleep(3)
            else:
                print('已经是最后一页了')
        else:
            print('origin_html为None')

    def get_detail_url(self, data):
        """Fetch, parse and persist the detail page of every article tuple."""
        for data_tuple in data:
            # data_tuple[2] is a relative href like /article/120510346.
            detail_url = 'https://www.qiushibaike.com' + data_tuple[2]
            q_id = data_tuple[2].split('/')[2]
            detail_spider = QSBKDetailSpider(detail_url)
            detail_html = detail_spider.get_page_detail()
            res = detail_spider.parse_page_detail(detail_html)
            QSBKDBTool.save_detail_data(q_id, res)


if __name__ == "__main__":
    QSBKDBTool.connect_db()
    qsbk = QSBKSpider()
    origin_html = qsbk.get_page_list(1)
    qsbk.parser_page_list(origin_html)
    QSBKDBTool.connect_close()
版权声明:本文标题:正则爬取网页数据(二) 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.freenas.com.cn/jishu/1726434976h960042.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论