Python crawler: scraping the Maoyan Movies "now showing" word-of-mouth board (热映口碑榜)
A simple scraping exercise written after some study: the script fetches the Maoyan 热映口碑榜 page with requests, extracts each movie entry with a regular expression, and appends the results as JSON lines to a text file.
import requests
import re
import json
import time


def get_one_page(url):
    """Fetch the page HTML, or return None on a non-200 response."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        return None


def parse_page(html):
    """Parse the page and yield one dict per movie entry."""
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    print(items)
    for item in items:
        yield {
            'index': item[0],
            'title': item[1].strip(),
            # Strip the leading "主演：" (3 characters) from the actor field
            'actor': item[2].strip()[3:] if len(item[2]) > 3 else '',
            # Strip the leading "上映时间：" (5 characters) from the release time
            'time': item[3].strip()[5:] if len(item[3]) > 5 else '',
            # The score is split into an integer part and a fractional part
            'score': item[4].strip() + item[5].strip()
        }


def write_file(content):
    """Append one movie record per line as JSON."""
    with open('热映口碑榜.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    url = 'https://maoyan.com/board/7'
    html = get_one_page(url)
    time.sleep(1)
    if html:
        for item in parse_page(html):
            write_file(item)
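
To check the regular expression without hitting the live site, parse_page can be run against a small hand-written snippet. The snippet below is made up for illustration and only mirrors the tag/class layout the pattern expects; the real Maoyan markup contains more attributes and entries.

sample_html = '''
<dd>
  <i class="board-index">1</i>
  <p class="name"><a href="/films/1">示例电影</a></p>
  <p class="star">主演：演员A,演员B</p>
  <p class="releasetime">上映时间：2022-06-01</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>
'''

for item in parse_page(sample_html):
    print(item)
# Expected output:
# {'index': '1', 'title': '示例电影', 'actor': '演员A,演员B',
#  'time': '2022-06-01', 'score': '9.5'}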