ps: 以下用到的模块都是内置的, 无须安装.
最近学了scrapy 框架,发现了 yield 的影子,所以打算在这个栗子中也来复习复习
yield
和
yield from
。顺便体验一下传说中使用正则解析数据的痛苦。而为了更好体验劳动成果,使用 csv 文件格式来存储爬取的数据。发现有几个电影的数据拿不到,但是单独把提取不出来的那部分源数据,用正则解析一遍,又没有发现问题,所以正则那一块还需要改一改。
import csv
import re
import time
import urllib.request

from fake_useragent import UserAgent

# import os, http.cookiejar


def save2csv(generator, filepath, field_names):
    """Drain *generator* and write every movie dict it yields into a CSV file.

    generator   -- iterable of dicts whose keys match *field_names*
    filepath    -- destination CSV path (file is overwritten)
    field_names -- column order passed to csv.DictWriter
    """
    with open(filepath, 'w', encoding="utf-8", newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=field_names)
        writer.writeheader()
        # Iterating the generator directly replaces the manual
        # next()/StopIteration loop of the original.
        for msg in generator:
            try:
                writer.writerow(msg)
            except Exception as e:
                # Best effort: report the bad row and keep writing the rest.
                print(e)


def request_url(start_url, headers):
    """Generator: crawl 10 ranking pages and yield one movie dict at a time.

    start_url -- URL template with a '{}' placeholder for the page offset
    headers   -- request headers (User-Agent / Cookie)
    """
    for i in range(10):
        url = start_url.format(i * 10)
        print(url)
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        # Delegate parsing to the per-page generator.
        yield from pick_movie_msgs(response)
        # Be polite: throttle requests so we neither overload the server
        # nor get our IP banned.
        time.sleep(3)


# Pre-compiled pattern; re.S lets '.' cross newlines so one regex walks a
# whole list entry.  Fixes over the original pattern:
#  * the score's integer part escapes the dot: (\d+\.) instead of (\d+.)
#  * the release date tolerates a bare year ("1993") as well as a full
#    "1993-01-01" -- year-only entries are what the original (\d+-\d+-\d+)
#    silently failed to match, dropping those movies.
_rule = re.compile(
    r'.*?>(\d+)'                                 # ranking number
    + r'.*?src="https(.*?)".*?>'                 # poster URL (scheme captured separately)
    + r'.*?title="(.*?)"'                        # movie title
    + r'.*?star">(.*?) '                         # star list text
    + r'.*?releasetime">.*?(\d+(?:-\d+-\d+)?)'   # release date: yyyy or yyyy-mm-dd
    + r'.*?integer">(\d+\.).*?fraction">(\d).*? ',  # score integer + fraction part
    flags=re.S)

# Accept both a half-width and a full-width colon when stripping the
# "主演:" label -- TODO(review): confirm which form the live page uses.
_colon = re.compile('[::]')


def pick_movie_msgs(response):
    """Generator: parse one response page and yield a dict per movie found."""
    page_source = response.read().decode("utf-8")
    for data in _rule.findall(page_source):  # each match is a 7-tuple
        if not data:
            continue
        try:
            # Original did split(":")[1], which raised IndexError when no
            # half-width colon was present; an exception escaping a
            # generator closes it and aborts the remaining pages.
            star_parts = _colon.split(data[3].strip(), maxsplit=1)
            yield {
                "order_number": data[0].strip(),
                "img_url": "".join(("https", data[1].strip())),
                "title": data[2].strip(),
                # Keep the names after the label; fall back to the raw
                # text when no colon is found.
                "star": star_parts[1] if len(star_parts) > 1 else star_parts[0],
                "release_time": data[4].strip(),
                "score": "".join((data[5].strip(), data[6].strip())),
            }
        except Exception as e:
            # One malformed entry must not kill the whole crawl.
            print(e)


def main():
    """Entry point: crawl the Maoyan top-100 board and store it as CSV."""
    start_url = "https://maoyan.com/board/4?offset={}"
    headers = {
        "User-Agent": UserAgent().random,
        "Cookie": "__mta=188595863.1563115601062.1563116145904.1563361495934.20; uuid_n_v=v1; uuid=3012C9D0A64611E9AB785F96570DC49513513EE76D2F46EDBCA08C92F92471ED; _lxsdk_cuid=16bf0f33aa941-0e8e80acdaf83f-e343166-e1000-16bf0f33aaac8; _lxsdk=3012C9D0A64611E9AB785F96570DC49513513EE76D2F46EDBCA08C92F92471ED; _csrf=f6517f9b55f5215e1d80ab9ec8c7885968e50a0d83dd8bac15126cdd0969810c; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=188595863.1563115601062.1563116145904.1563361420789.20; _lxsdk_s=16bff99cb1a-d00-c10-473%7C%7C8",
    }
    filepath = './maoyan.csv'
    data_head = ["order_number", "img_url", "title",
                 "star", "release_time", "score"]
    generator = request_url(start_url, headers)
    print(type(generator))
    save2csv(generator, filepath, data_head)


# Guarding the call keeps importing this module free of network side effects.
if __name__ == "__main__":
    main()