Python 爬虫分析后疫情电影现状:从低迷到稳定
数据获取
# Crawl daily box-office data
def crawl(path):
    """Download daily box-office lists from Maoyan and save them to Excel.

    One request is made per day, starting at 2020-07-20 and covering every
    full day elapsed up to now. Each day's movie list becomes a set of rows
    tagged with a ``date`` column and a 0-based ``rank`` column (the row's
    position in that day's list).

    Args:
        path: Destination passed to ``pd.ExcelWriter``.

    Days whose request or parsing fails are reported on stdout and skipped,
    so a single bad day does not abort the whole crawl.
    """
    start = datetime.datetime.strptime('20200720', '%Y%m%d')
    duration = (datetime.datetime.now() - start).days

    # The headers never change, so build them once instead of per request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
        'X-FOR-WITH': 'Rz7i6ufGHjmGodGgu8BbUrav6q0DVrkeC5I7iR1aieo0JKLMIWrKB23CUCa30sB5g2mut8nHm5zagQwr8n9WzUV1B2MdplRa38rPEdN5FR5Ei9Op0hWjBndw7WgEr+4N2K5oMAp85ve1WArjpDa2XhOtge64qDRoEy53qsLl13UrBMVcg7Z2fQU48wHxEjkHsSGWySnSWHBs/flWxyj3OX+OSXf2fzg8fIVPTLI/k+w=',
    }

    frames = []
    total_rows = 0
    for offset in range(duration):
        # Derive the date from the loop offset so it advances exactly once
        # per iteration regardless of whether the request succeeds.
        days = (start + datetime.timedelta(days=offset)).strftime('%Y%m%d')
        url = 'http://piaofang.maoyan.com/dashboard-ajax/movie?showDate={}'.format(days)
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            r = requests.get(url, headers=headers, timeout=10)
            films_message = r.json()['movieList']['list']
            df2 = pd.DataFrame(films_message)
            df2.insert(0, 'date', days)
            df2.insert(1, 'rank', df2.index)
        except Exception as exc:
            # Best-effort crawl: report the failing day and its cause, then
            # move on. Unlike a bare ``except``, this lets Ctrl-C through.
            print(days, exc)
            continue
        frames.append(df2)
        total_rows += len(df2)
        print(total_rows, days)

    # Concatenate once at the end; an empty crawl still writes an empty sheet.
    if frames:
        df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        df = pd.DataFrame()

    # The context manager saves and closes the workbook, replacing the
    # removed ``writer.save()`` call.
    with pd.ExcelWriter(path) as writer:
        df.to_excel(writer, index=False)
图1 票房数据样例
图2 豆瓣影评样例
# Look up the Douban page URL for each movie
def crawl_ID():
    """Search Douban for every movie title and store the first hit in Redis.

    Titles are read from the '电影名称' column of sheet '上映' in
    '票房数据.xlsx'. For each distinct title, the Douban search page is opened
    in Chrome, and the ``href`` of the first ``a.title-text`` element is
    added to the Redis set ``douban_movieId``. Titles with no search result
    are reported on stdout.
    """
    from urllib.parse import quote

    data = pd.read_excel('票房数据.xlsx', sheet_name='上映')
    # Drop missing titles so that NaN is not searched as the text "nan".
    movies = data['电影名称'].dropna().unique()

    # Reuse one browser session for all titles instead of one per title.
    driver = webdriver.Chrome()
    try:
        for name in movies:
            # Percent-encode the title: characters such as '&', '#' or spaces
            # would otherwise corrupt the query string.
            url = 'https://movie.douban.com/subject_search?search_text={}'.format(quote(str(name)))
            driver.get(url)
            # Presumably the results are rendered client-side; give the page
            # time to finish before reading its source.
            time.sleep(2)
            soup1 = BeautifulSoup(driver.page_source, 'lxml')
            hits = soup1.select('a.title-text')
            if hits:
                targetUrl = hits[0].get('href')
                print(targetUrl)
                redis_db.sadd('douban_movieId', targetUrl)
            else:
                print("无结果")
    finally:
        # quit() ends the whole WebDriver session even if a lookup raised;
        # close() would only close the current window.
        driver.quit()
# Build Douban review-page URLs
def get_url():
    """Generate paginated comment URLs for every stored Douban movie URL.

    Reads the members of the Redis set ``douban_movieId``. For each one, it
    builds 25 comment-page URLs (``start`` = 0, 20, ..., 480 with 20 comments
    per page), prints each URL, and adds it to the Redis set
    ``douban_movieURL``.
    """
    page_size = 20
    for douban_movieId in redis_db.smembers('douban_movieId'):
        # redis-py returns bytes unless the client was created with
        # decode_responses=True (client config is not visible here);
        # decode so the string concatenation below cannot raise TypeError.
        if isinstance(douban_movieId, bytes):
            douban_movieId = douban_movieId.decode('utf-8')
        for i in range(25):
            page = page_size * i
            url = douban_movieId + 'comments?start={}&limit=20&sort=new_score&status=P'.format(page)
            print(url)
            redis_db.sadd('douban_movieURL', url)
数据可视化
对抓取到的数据进行清洗后,将其导入Tableau中进行可视化。
图3 整体电影概况
图4 重映电影概况
图5 新上映电影概况
图6 《八佰》豆瓣影评词云图
图7 《花木兰》豆瓣影评词云图
结语
作者简介:
更多精彩推荐
☞华为澳大利亚大动作,终止4.9亿投资;iPhone 12 或10月13日发布;Swift正式登陆Win 10 | 极客头条
☞硅谷2020最新大数据学习路线:科学使用这一招,12周助你成为数据分析师
☞国产开源,GitHub 标星 47000+ ,百度飞桨从打响第一枪到战役突围
☞用以太坊承载的比特币,还只是起步阶段
点分享 点点赞 点在看
关注公众号:拾黑(shiheibook)了解更多
[广告]赞助链接:
四季很好,只要有你,文娱排行榜:https://www.yaopaiming.com/
让资讯触达得更精准有趣:https://www.0xu.cn/