用 Python 爬取了《雪中悍刀行》数据,终于知道它为什么这么火了
2025-08-02 12:19
#print(url)
for i in range(page_num):
params = {'orinum': '10', 'cursor': cid}
html = get_html(url, params)
cid = parse_page(infolist, html)
print_comment_list(infolist)
save_to_txt(infolist, 'content.txt')
main()
2.爬取评论时间代码:sp.py
import requests
import re
import random
def get_html(url, params, timeout=10):
    """Fetch *url* with a randomly chosen desktop User-Agent and return its text.

    Args:
        url: Address to request.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
    ]
    headers = {"User-Agent": random.choice(uapools)}
    r = requests.get(url, headers=headers, params=params, timeout=timeout)
    r.raise_for_status()
    # Force UTF-8: without this the decoded text comes out garbled.
    # (The earlier apparent_encoding assignment was overwritten right away,
    # so it only cost a charset-detection pass and has been dropped.)
    r.encoding = 'utf-8'
    return r.text
def parse_page(infolist, data):
    """Extract one page of "time" values and the cursor for the next page.

    Args:
        infolist: List that receives one new entry per call: the list of
            all ``"time":"..."`` values found in *data*.
        data: Raw response text.

    Returns:
        The first ``"last":"..."`` value found in *data*.

    Raises:
        IndexError: If *data* contains no ``"last"`` value. Nothing is
            appended to *infolist* in that case.
    """
    commentpat = r'"time":"(.*?)"'
    lastpat = r'"last":"(.*?)"'
    # Look for the cursor first so a page without one leaves infolist untouched.
    cursor = re.search(lastpat, data)
    if cursor is None:
        raise IndexError('no "last" cursor found in response data')
    infolist.append(re.findall(commentpat, data, re.S))
    return cursor.group(1)
def print_comment_list(infolist):
    """Print every page of *infolist* under a numbered page header.

    Args:
        infolist: List of pages; each page is a list of strings.
    """
    for page_no, page in enumerate(infolist, start=1):
        print('第' + str(page_no) + '页')
        for comment in page:
            print(comment)
def save_to_txt(infolist, path):
    """Write every item of every page in *infolist* to *path*, one per line.

    Args:
        infolist: List of pages; each page is a list of strings.
        path: Destination file, overwritten and encoded as UTF-8.
    """
    # One item per line: the conversion script reads this file line by line
    # and calls int() on each line, so items must not be run together.
    with open(path, 'w', encoding='utf-8') as fw:
        for page in infolist:
            fw.writelines(item + '\n' for item in page)
def main():
    """Crawl ``page_num`` pages, then print the results and save them to time.txt.

    Each request passes the cursor returned by the previous page, starting
    from "0".
    """
    infolist = []
    vid = '7579013546'
    cid = "0"
    page_num = 3000
    # NOTE(review): the URL prefix is an empty string, so the request URL
    # has no scheme or host. Confirm the intended endpoint before running.
    url = '' + vid + '/comment/v2'
    for _ in range(page_num):
        params = {'orinum': '10', 'cursor': cid}
        html = get_html(url, params)
        cid = parse_page(infolist, html)
    print_comment_list(infolist)
    save_to_txt(infolist, 'time.txt')


if __name__ == '__main__':
    main()
二.数据处理部分
1.评论的时间戳转换成为正常时间 time.py
# coding=gbk
import csv
import time
# Convert each Unix timestamp in time.txt (one per line) into a two-column
# row of data.csv: date in the first column, time of day in the second.
with open("time.txt", 'r', encoding='utf-8') as f, \
        open("data.csv", 'w', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for line in f:
        line = line.strip()
        if not line:
            # A blank line would make int() raise.
            continue
        timeArray = time.localtime(int(line))
        formatted = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        print(formatted)
        # Splitting on the space yields [date, time].
        writer.writerow(formatted.split())
2.评论具体内容读入csv CD.py
# coding=gbk
import csv
# Copy content.txt into content.csv: each input line becomes one row, with
# its whitespace-separated pieces as the columns.
with open("content.txt", 'r', encoding='utf-8') as f, \
        open("content.csv", 'w', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for line in f:
        writer.writerow(line.split())
3.统计一天各个时间段内的评论数 py.py
# coding=gbk
import csv
from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud
from collections import Counter

# Take the first two characters of the second column of every row.
with open('../Spiders/data.csv', encoding='utf-8') as csvfile:
    data1 = [str(row[1])[0:2] for row in csv.reader(csvfile)]
print(data1)
print(type(data1))

# Count how often each distinct value occurs, sorted by value. Counter does
# this in one pass instead of calling list.count() once per distinct value.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

# One row per (value, count) pair, each element in its own column.
with open("time2.csv", "w", newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter=',').writerows(rst)

# Same values that reading time2.csv back would give.
x = [item for item, _ in rst]
print(x)
y1 = [float(count) for _, count in rst]
print(y1)
4.统计每天评论数 py1.py
# coding=gbk
import csv
from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud
from collections import Counter

# Take the first column of every row.
with open('../Spiders/data.csv', encoding='utf-8') as csvfile:
    data1 = [str(row[0]) for row in csv.reader(csvfile)]
print(type(data1))

# Count how often each distinct value occurs, sorted by value. Counter does
# this in one pass instead of calling list.count() once per distinct value.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

# One row per (value, count) pair, each element in its own column.
with open("time1.csv", "w", newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter=',').writerows(rst)

# Same values that reading time1.csv back would give.
x = [item for item, _ in rst]
print(x)
y1 = [float(count) for _, count in rst]
print(y1)
三. 数据分析
数据分析方面:涉及到了词云图、条形图、折线图、饼图,后三者是对评论时间与主演占比的分析。然而百度的评论时间是以时间戳的形式显示的,所以要先进行转换,再去统计出现次数。最后,新加了对评论具体内容的情感分析。
1.制作词云图 wc.py
import numpy as np
import re
import jieba
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image
# 上面的包自己安装,不会的就百度
# Source text for the word cloud.
with open('content.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Strip ASCII letters, digits and a few punctuation marks. The brackets are
# escaped: unescaped, the character class ended at the first "]" and the
# pattern matched almost nothing, so letters were left in the text.
newtxt = re.sub(r"[A-Za-z0-9!%\[\],。]", "", txt)
print(newtxt)

# Chinese text has no spaces between words, so segment it with jieba and
# join the words with spaces before handing them to WordCloud.
words = jieba.lcut(newtxt)

# Image whose shape is used as the mask.
img = Image.open(r'wc.jpg')
img_array = np.array(img)

# collocations=False keeps word pairs from being counted as extra entries.
wordcloud = WordCloud(
    background_color="white",
    width=1080,
    height=960,
    font_path="../文悦新青年.otf",
    max_words=150,
    scale=10,  # output resolution multiplier
    max_font_size=100,
    mask=img_array,
    collocations=False,
).generate(" ".join(words))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('wc.png')
轮廓图:wc.jpg
在这里插入图片描述
词云图:result.png (注意:这里要把英文字母过滤掉)
2.制作每天评论数柱状图 DrawBar.py
# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType
class DrawBar(object):
    """Draw a pyecharts bar chart from the two columns of time1.csv."""

    def __init__(self):
        """Create the Bar chart object and set its width, height and theme."""
        self.bar = Bar(init_opts=opts.InitOpts(width='1500px', height='700px', theme=ThemeType.LIGHT))

    def add_x(self):
        """Add the X-axis data: the first column of time1.csv."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            x = [str(row[0]) for row in reader]
        print(x)
        self.bar.add_xaxis(xaxis_data=x)

    def add_y(self):
        """Add the Y-axis data: the second column of time1.csv, as floats."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            y1 = [float(row[1]) for row in reader]
        print(y1)
        self.bar.add_yaxis(
            # NOTE(review): the original series name was truncated in the
            # source ("series_,"); an empty name is used until it is known.
            series_name='',
            y_axis=y1,  # Y-axis values
            label_opts=opts.LabelOpts(is_show=True, color="black"),  # value labels
            bar_max_width='100px',  # maximum bar width
        )

    def set_global(self):
        """Set the chart-wide options: title, tooltip and toolbox."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                title='雪里面悍刀行近日华尔街日报统计资料',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,  # show the tooltip
                trigger="axis",  # trigger type
                axis_pointer_type="cross",  # pointer type
            ),
            toolbox_opts=opts.ToolboxOpts(),  # toolbox with default options
        )

    def draw(self):
        """Add both axes, apply the global options and render the HTML file."""
        self.add_x()
        self.add_y()
        self.set_global()
        self.bar.render('../Html/DrawBar.html')

    def run(self):
        """Entry point: draw the chart."""
        self.draw()
# Build and render the chart only when this file is run as a script.
if __name__ == '__main__':
    app = DrawBar()
    app.run()
效果图:DrawBar.html
3.制作每日评论数饼图 pie_pyecharts.py
import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType
# Read time1.csv once: first column gives the labels, second the values.
with open('time1.csv') as csvfile:
    rows = list(csv.reader(csvfile))
x = [str(row[0]) for row in rows]
print(x)
y1 = [float(row[1]) for row in rows]
print(y1)
num = y1
lab = x

# Three charts of the same (label, value) pairs on one canvas. The whole
# chain is wrapped in parentheses so it can span several lines.
(
    Pie(init_opts=opts.InitOpts(width='1700px', height='450px', theme=ThemeType.LIGHT))
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="雪里面悍刀行近日华尔街日报统计资料",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27),
        ),
        legend_opts=opts.LegendOpts(
            pos_top="10%", pos_left="1%",  # legend position
        ),
    )
    # plain pie
    .add(series_name='', center=[280, 270], data_pair=[(j, i) for i, j in zip(num, lab)])
    # ring (inner and outer radius)
    .add(series_name='', center=[845, 270], data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose chart
    .add(series_name='', center=[1380, 270], data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
).render('pie_pyecharts.html')
效果图
4.制作观看时间区间评论统计饼图 pie_pyecharts3.py
# coding=gbk
import csv
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from sympy.combinatorics import Subset
from wordcloud import WordCloud
from pyecharts.charts import Pie
from random import randint
from collections import Counter

# Hour of day = first two characters of the second column.
# NOTE(review): the path literal was truncated in the source
# ("open(/data.csv')"); '../Spiders/data.csv' is the path the sibling
# scripts use for this file. Confirm.
with open('../Spiders/data.csv', encoding='utf-8') as csvfile:
    data2 = [int(row[1][0:2]) for row in csv.reader(csvfile)]
print(type(data2))

# (hour, count) pairs sorted by hour. The name no longer shadows the
# builtin ``list``.
hour_counts = sorted(Counter(data2).items())
print(type(hour_counts))

with open("time2.csv", "w", newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter=',').writerows(hour_counts)

# Four six-hour periods. Bucketing by the hour value itself, not by list
# position, stays correct when some hours have no rows at all.
x = ['凌晨', '上午', '下午', '晚上']
hours_per_period = 6
periods = [[] for _ in x]
for hour, count in hour_counts:
    periods[hour // hours_per_period].append((hour, count))
for label, bucket in zip(x, periods):
    print(label + " : ", bucket)

y1 = [count for _, count in hour_counts]
print(y1)
groups = [[count for _, count in bucket] for bucket in periods]
print(groups)

# One total per period. The original summed each group but threw the sum
# away, so the charts plotted the raw values of the last group instead.
num = [sum(group) for group in groups]
lab = x

# Three charts of the same (label, total) pairs on one canvas.
(
    Pie(init_opts=opts.InitOpts(width='1500px', height='450px', theme=ThemeType.LIGHT))
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="雪里面悍刀行观看小时列车运行华尔街日报统计资料",
            title_textstyle_opts=opts.TextStyleOpts(font_size=30),
        ),
        legend_opts=opts.LegendOpts(
            pos_top="8%",  # legend position
        ),
    )
    # plain pie
    .add(series_name='', center=[260, 270], data_pair=[(j, i) for i, j in zip(num, lab)])
    # ring (inner and outer radius)
    .add(series_name='', center=[1230, 270], data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose chart
    .add(series_name='', center=[750, 270], data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
).render('pie_pyecharts3.html')
效果图
04 总结
1. 本文详细介绍了如何爬取百度视频评论并进行可视化分析,读者可以自行动手尝试。
2. 本文十分适合小白进行练手。
3. 本文仅供学习参考,不做它用。
。大连白癜风哪里治疗好武汉哪里能治疗白癜风
成都哪家医院看白癜风

-
曾4.4亿元摘地!北二环装饰城地块项目工程建设公示 拟建12栋住宅
门户网站焦点“楼市新闻报道”报道:4月初1日,门户网站焦点获悉,石家庄市生物资源和都市计划局发布了关于取悦园项目设计方案批前公示的公告。公告结果显示,河北筑谦房地产开

-
2021年,女子与男友吵架喝农药身亡,闺蜜全程帮忙,我先喝金子
其余部分口,就被旁边的物业兼职其他部门逃离现场了,程程口之中的那口杀菌剂也吐了出来。但珍珍就光景了!只见程程假的喝到下了杀菌剂,珍珍也不含糊,也拉起了杀菌剂红豆瓶。
- 10-24陈峻齐:强势避险依在 日内黄金1920支撑上在此期间多
- 10-24宁德蕉城区漳湾国际生活品质方舱驿站加速建设
- 10-24推荐几种锅贴的吃法,美味可口,做法有用,感兴趣的收藏吧
- 10-24改变自己,才有渴望
- 10-24黄金大卖!消费者:一只项链涨了几千元
- 10-24济南精准医学高新技术三地块14个单体预计今年9月投用
- 10-24新疆石河子新安酒业“3两3酒”甲醇检出被罚
- 10-24领导送盒茶叶给下属,怎样回礼情商高?铭记3个技巧,领导高看你
- 10-24农户投资屋顶光伏是不是骗局?之后不能碰,捂好口袋
- 10-24越做越走“心” 2022款斯巴鲁森林人白雪体验