用 Python 爬取了《雪中悍刀行》数据,终于知道它为什么这么火了
2025-08-02 12:19
#print(url)
for i in range(page_num):
params = {'orinum': '10', 'cursor': cid}
html = get_html(url, params)
cid = parse_page(infolist, html)
print_comment_list(infolist)
save_to_txt(infolist, 'content.txt')
main()
2.爬取评论时间代码:sp.py
import requests
import re
import random
def get_html(url, params, timeout=10):
    """Fetch ``url`` with a randomly chosen User-Agent and return the body text.

    Args:
        url: Address to request.
        params: Query-string parameters passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
    ]
    headers = {"User-Agent": random.choice(uapools)}
    r = requests.get(url, headers=headers, params=params, timeout=timeout)
    r.raise_for_status()
    # Force UTF-8: without this the decoded text comes out garbled.  (The
    # former ``apparent_encoding`` assignment was overwritten immediately and
    # only cost an extra charset-detection pass, so it is gone.)
    r.encoding = 'utf-8'
    return r.text
def parse_page(infolist, data):
    """Extract one page of ``"time"`` values and the cursor for the next page.

    Args:
        infolist: List that receives this page's matches as a single
            sub-list (mutated in place).
        data: Raw response text to scan.

    Returns:
        The first ``"last"`` value found in ``data``; the caller feeds it
        back as the ``cursor`` of the next request.

    Raises:
        IndexError: If ``data`` contains no ``"last"`` field.  ``infolist``
            is left untouched in that case.
    """
    cursor_match = re.search(r'"last":"(.*?)"', data)
    if cursor_match is None:
        # Same exception type as the former bare ``[0]`` lookup, but with a
        # message that says what is actually missing.
        raise IndexError('no "last" cursor found in response')
    commentall = re.findall(r'"time":"(.*?)"', data, re.S)
    infolist.append(commentall)
    return cursor_match.group(1)
def print_comment_list(infolist):
    """Print every collected item, one per line, under a per-page header.

    Args:
        infolist: List of pages, each page being a list of strings.
    """
    for page_no, page in enumerate(infolist, start=1):
        print('第' + str(page_no) + '页')
        for comment in page:
            print(comment)
def save_to_txt(infolist, path):
    """Write every collected item to ``path``, one item per line.

    The downstream scripts iterate over the saved file line by line (and
    call ``int(line)`` on the timestamps), so each item must be terminated
    by a newline; appending an empty string ran all items together.

    Args:
        infolist: List of pages, each page being a list of strings.
        path: Destination file; created or truncated, written as UTF-8.
    """
    # ``with`` closes the file even if a write fails part-way through.
    with open(path, 'w', encoding='utf-8') as fw:
        for page in infolist:
            for item in page:
                fw.write(item + '\n')
def main():
    """Crawl ``page_num`` pages, then print the results and save them to time.txt.

    Each request passes the cursor returned by the previous page.  Whatever
    has been collected is printed and saved even when a request or parse
    step fails part-way through; the error is still raised afterwards.
    """
    infolist = []
    vid = '7579013546'
    cid = "0"
    page_num = 3000
    # NOTE(review): the URL prefix is an empty string, so this is not a
    # fetchable address as written -- the scheme/host part must be supplied.
    url = '' + vid + '/comment/v2'
    try:
        for _ in range(page_num):
            params = {'orinum': '10', 'cursor': cid}
            html = get_html(url, params)
            cid = parse_page(infolist, html)
    finally:
        # Do not throw away pages already fetched if the crawl stops early.
        print_comment_list(infolist)
        save_to_txt(infolist, 'time.txt')


main()
二.数据处理部分 1.评论的时间戳转换为正常时间 time.py
# coding=gbk
import csv
import time
# Convert the timestamps stored one per line in time.txt into rows of
# data.csv.  Each row is the formatted local time split on whitespace,
# i.e. two columns: "YYYY-MM-DD" and "HH:MM:SS".
# Both files are managed by ``with`` so they are closed even if a line
# fails to convert.
with open("data.csv", 'w', newline='', encoding='utf-8') as csvFile, \
        open("time.txt", 'r', encoding='utf-8') as f:
    writer = csv.writer(csvFile)
    for line in f:
        line = line.strip()
        if not line:
            # A blank line would make int() raise; skip it instead.
            continue
        timeArray = time.localtime(int(line))
        formatted = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        print(formatted)
        writer.writerow(formatted.split())
2.评论具体内容读入csv CD.py
# coding=gbk
import csv
# Copy content.txt into content.csv: every input line becomes one CSV row
# whose cells are the whitespace-separated pieces of that line.
# ``with`` guarantees both files are closed even if a write fails.
with open("content.csv", 'w', newline='', encoding='utf-8') as csvFile, \
        open("content.txt", 'r', encoding='utf-8') as f:
    writer = csv.writer(csvFile)
    for line in f:
        writer.writerow(line.split())
3.统计一天各个时间段内的评论数 py.py
# coding=gbk
import csv
from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud
# Count rows per value of the first two characters of column 1 of data.csv
# (presumably the hour of an "HH:MM:SS" field -- confirm against the file
# that produces data.csv), then store the tally in time2.csv.
with open('../Spiders/data.csv') as csvfile:
    data1 = [str(row[1])[0:2] for row in csv.reader(csvfile)]
print(data1)
print(type(data1))

# Tally in a single pass; calling list.count() once per distinct value
# rescans the whole list every time.
counts = {}
for item in data1:
    counts[item] = counts.get(item, 0) + 1
rst = sorted(counts.items())
print(type(rst))
print(rst)

# One (value, count) pair per row.
with open("time2.csv", "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(rst)

# Read the tally back once and split it into the two series.
with open('time2.csv') as csvfile:
    rows = list(csv.reader(csvfile))
x = [str(row[0]) for row in rows]
print(x)
y1 = [float(row[1]) for row in rows]
print(y1)
4.统计最近评论数 py1.py
# coding=gbk
import csv
from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud
# Count rows per value of column 0 of data.csv (presumably the date part
# of each timestamp -- confirm against the file that produces data.csv),
# then store the tally in time1.csv.
with open('../Spiders/data.csv') as csvfile:
    data1 = [str(row[0]) for row in csv.reader(csvfile)]
print(type(data1))

# Tally in a single pass; calling list.count() once per distinct value
# rescans the whole list every time.
counts = {}
for item in data1:
    counts[item] = counts.get(item, 0) + 1
rst = sorted(counts.items())
print(type(rst))
print(rst)

# One (value, count) pair per row.
with open("time1.csv", "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(rst)

# Read the tally back once and split it into the two series.
with open('time1.csv') as csvfile:
    rows = list(csv.reader(csvfile))
x = [str(row[0]) for row in rows]
print(x)
y1 = [float(row[1]) for row in rows]
print(y1)
三. 数据分析 数据分析方面:涉及到了词云图,条形图,折线图,饼图,后三者是对评论时间与主演占比的分析,然而百度的评论时间是以时间戳的形式显示,所以要进行转换,再去统计出现次数,最后,新加了对评论具体内容的情感分析。
1.制作词云图 wc.py
import numpy as np
import re
import jieba
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image
# 上面的包自己安装,不会的就百度
# Source text from which the word cloud is generated.
with open('content.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Strip ASCII letters, digits and the listed punctuation.  The square
# brackets are escaped so that they are members of the character class;
# unescaped, the class ended at the first "]" and the pattern only matched
# a class character followed by the literal text ",。]", i.e. almost never.
newtxt = re.sub(r"[A-Za-z0-9!%\[\],。]", "", txt)
print(newtxt)

# Chinese text has no spaces between words, so segment it with jieba and
# join the tokens with spaces before handing it to WordCloud (the segmented
# list used to be computed and then ignored).
words = jieba.lcut(newtxt)

img = Image.open(r'wc.jpg')  # image whose shape is used as the mask
img_array = np.array(img)

# collocations=False keeps repeated word pairs out of the cloud.
wordcloud = WordCloud(
    background_color="white",
    width=1080,
    height=960,
    font_path="../文悦新青年.otf",
    max_words=150,
    scale=10,  # output sharpness
    max_font_size=100,
    mask=img_array,
    collocations=False,
).generate(' '.join(words))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('wc.png')
轮廓绘出:wc.jpg
在这里放入绘出片描述
词云图:result.png (注:这里要把字母过滤掉)
2.制作最近评论数条形图 DrawBar.py
# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType
class DrawBar(object):
    """Bar chart built from time1.csv and rendered to an HTML file."""

    def __init__(self):
        """Create the Bar object and set its width, height and theme."""
        self.bar = Bar(init_opts=opts.InitOpts(width='1500px', height='700px', theme=ThemeType.LIGHT))

    def add_x(self):
        """Add the X-axis data: column 0 of time1.csv."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            x = [str(row[0]) for row in reader]
        print(x)
        self.bar.add_xaxis(
            xaxis_data=x,
        )

    def add_y(self):
        """Add the Y-axis data: column 1 of time1.csv, as floats."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            y1 = [float(row[1]) for row in reader]
        print(y1)
        self.bar.add_yaxis(
            # NOTE(review): the series name was garbled in the listing
            # ("series_,"); an empty name is used here, as in the pie-chart
            # scripts -- set the intended legend label.
            series_name='',
            y_axis=y1,  # Y-axis values
            label_opts=opts.LabelOpts(is_show=True, color="black"),  # value labels
            bar_max_width='100px',  # maximum bar width
        )

    def set_global(self):
        """Set the chart's global options: title, tooltip and toolbox."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                title='雪里面悍刀行近日华尔街日报统计资料',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,  # show the tooltip
                trigger="axis",  # trigger type
                axis_pointer_type="cross",  # pointer type
            ),
            toolbox_opts=opts.ToolboxOpts(),  # toolbox with default settings
        )

    def draw(self):
        """Populate the chart and render it to ../Html/DrawBar.html."""
        self.add_x()
        self.add_y()
        self.set_global()
        self.bar.render('../Html/DrawBar.html')

    def run(self):
        """Entry point: draw the chart."""
        self.draw()
# Run only when executed as a script.  The dunder names had been mangled
# into an undefined identifier, which raised NameError.
if __name__ == '__main__':
    app = DrawBar()
    app.run()
效果绘出:DrawBar.html
3.制作近日评论数饼图 pie_pyecharts.py
import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType
# Read time1.csv once: column 0 supplies the labels, column 1 the values.
with open('time1.csv') as csvfile:
    rows = list(csv.reader(csvfile))
x = [str(row[0]) for row in rows]
print(x)
y1 = [float(row[1]) for row in rows]
print(y1)
num = y1
lab = x
# (label, value) pairs shared by the three charts below.
data_pair = list(zip(lab, num))

# The chained expression has to be wrapped in parentheses to span several
# lines; the opening parenthesis was missing.
(
    Pie(init_opts=opts.InitOpts(width='1700px', height='450px', theme=ThemeType.LIGHT))  # default size is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="雪里面悍刀行近日华尔街日报统计资料",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27),
        ),
        legend_opts=opts.LegendOpts(
            pos_top="10%", pos_left="1%",  # legend position
        ),
    )
    .add(series_name='', center=[280, 270], data_pair=data_pair)  # pie chart
    .add(series_name='', center=[845, 270], data_pair=data_pair, radius=['40%', '75%'])  # ring chart
    .add(series_name='', center=[1380, 270], data_pair=data_pair, rosetype='radius')  # rose chart
).render('pie_pyecharts.html')
效果绘出
4.制作观看时间区间评论统计饼图 pie_pyecharts3.py
# coding=gbk
import csv
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from sympy.combinatorics import Subset
from wordcloud import WordCloud
from pyecharts.charts import Pie
from random import randint
# Parse the first two characters of column 1 of every row as an integer
# hour (presumably an "HH:MM:SS" field -- confirm).
# NOTE(review): the path literal was truncated in the listing ("/data.csv'");
# it is restored to the path the sibling scripts read -- confirm.
with open('../Spiders/data.csv') as csvfile:
    data2 = [int(row[1][0:2]) for row in csv.reader(csvfile)]
print(type(data2))

# Tally rows per hour in one pass.  The names avoid shadowing the builtin
# ``list``.
hour_counts = {}
for hour in data2:
    hour_counts[hour] = hour_counts.get(hour, 0) + 1
hour_rows = sorted(hour_counts.items())
print(type(hour_rows))

# One (hour, count) pair per row.
with open("time2.csv", "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(hour_rows)

x = ['凌晨', '上午', '下午', '晚上']

# Bucket by the hour itself (0-5, 6-11, 12-17, 18-23) instead of by
# position in the list: positional chunks are only right when all 24 hours
# are present, and with fewer than four distinct hours the chunk size was
# 0, which made range() raise.
period_rows = [[] for _ in x]
for hour, count in hour_rows:
    period_rows[hour // 6].append((hour, count))
print("凌晨 : ", period_rows[0])
print("上午 : ", period_rows[1])
print("下午 : ", period_rows[2])
print("晚上 : ", period_rows[3])

y1 = [count for _, count in hour_rows]
print(y1)
groups = [[count for _, count in rows] for rows in period_rows]
print(groups)

# One total per period.  The per-period sums were previously computed and
# discarded, so the charts plotted the first four hourly counts of the
# last chunk instead.
num = [sum(group) for group in groups]
lab = x
data_pair = list(zip(lab, num))

# The chained expression has to be wrapped in parentheses to span several
# lines; the opening parenthesis and the one closing set_global_opts were
# missing.
(
    Pie(init_opts=opts.InitOpts(width='1500px', height='450px', theme=ThemeType.LIGHT))  # default size is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="雪里面悍刀行观看小时列车运行华尔街日报统计资料",
            title_textstyle_opts=opts.TextStyleOpts(font_size=30),
        ),
        legend_opts=opts.LegendOpts(
            pos_top="8%",  # legend position
        ),
    )
    .add(series_name='', center=[260, 270], data_pair=data_pair)  # pie chart
    .add(series_name='', center=[1230, 270], data_pair=data_pair, radius=['40%', '75%'])  # ring chart
    .add(series_name='', center=[750, 270], data_pair=data_pair, rosetype='radius')  # rose chart
).render('pie_pyecharts3.html')
效果绘出
04 总结1. 本文参考概述了如何爬取百度预告片华尔街日报并进行可视化数据分析,读者可以自行动手尝试。
2. 本文十分较难桃子进行练手。
3. 本文供努力学习参考,不做它用。
。大连白癜风哪里治疗好武汉哪里能治疗白癜风
成都哪家医院看白癜风
-
华友钴业:预计2022年1-6月初盈利,净利润同比增49.86%至77.11%
华友钴业2022-07-12发布新闻利润预告,预计2022年1-6月利润22亿元至26亿元,工业产值上年增49.86%至77.11%。发函里理解本次利润增减的因素为:
-
藏格矿业:预计2022年1-6月盈利,净利润同比续427.55%至450%
藏格矿山2022-07-12发布净资产预告,预计2022年1-6月盈利23.5亿元至24.5亿元,去年同期一季度减427.55%至450%。核定中解释本次净资产变动的原因为:p
- 08-23心骑士:“气宗”如何解锁,神殿何处激活?老玩家为您答疑解惑
- 08-23克明药品:预计2022年1-6月盈利,净利润同比增80.89%至171.33%
- 08-23北斗星通:预计2022年1-6月盈利,净利润同比增10.9%至28.7%
- 08-23融资丨「PIX Moving」完成A1轮融资,东洋TIS株式会社独家战略投资
- 08-23中信证券:汽车零部件赛道挖掘 建议注目5G网络、遥控泊车等
- 08-23APP七天粥,真的有必要吗?
- 08-23《谭谈交通网络》全网下架,谭sir面临千万索赔?
- 08-23多次纠结后,我为什么还是备用了抖音
- 08-23不再挣扎?分析师随之“认清形势”纷纷下调美股收益预期
- 08-237月12日投资避雷针:10天9板电力蟠龙停牌核查