当前位置:首页 >> 金融
金融

用 Python 爬取了《雪中悍刀行》数据,终于告诉它为什么这么火了

2025-08-02 12:19

url = '' + vid + '/comment/v2'

#print(url)

for i in range(page_num):

params = {'orinum': '10', 'cursor': cid}

html = get_html(url, params)

cid = parse_page(infolist, html)

print_comment_list(infolist)

save_to_txt(infolist, 'content.txt')

main()

2.爬取华尔街日报小时字符:sp.py

import requests

import re

import random

def get_html(url, params):
    """Fetch *url* with query *params* and return the decoded body text.

    A User-Agent is picked at random from a small pool to reduce the
    chance of the endpoint throttling repeated requests.

    Raises requests.HTTPError (via raise_for_status) for non-2xx responses.
    """
    ua_pool = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
    ]
    headers = {"User-Agent": random.choice(ua_pool)}
    # timeout added so one stalled request cannot hang the crawl loop forever
    r = requests.get(url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    # Force UTF-8; the original note says the body comes back garbled
    # otherwise.  (The original also assigned r.apparent_encoding first and
    # immediately overwrote it — that dead assignment is dropped.)
    r.encoding = 'utf-8'
    return r.text

def parse_page(infolist, data):
    """Extract one page of comment "time" values from the raw response text.

    Appends the list of captured values to *infolist* (one list per page)
    and returns the "last" cursor used to request the next page.

    Raises IndexError if *data* contains no "last" field.
    """
    # Regex scraping of the JSON-ish payload rather than json.loads —
    # mirrors the original approach; re.S lets values span newlines.
    comments = re.findall(r'"time":"(.*?)"', data, re.S)
    infolist.append(comments)
    # Pagination cursor for the next request; [0] assumes it is present.
    return re.findall(r'"last":"(.*?)"', data)[0]

def print_comment_list(infolist):
    """Print every collected value, page by page, with a page header."""
    # enumerate replaces the original hand-maintained page counter j.
    for page_no, page in enumerate(infolist, start=1):
        print('第' + str(page_no) + '页')  # "page N" header
        for item in page:
            # original printed item + '' — the empty concat is dropped
            print(item)

def save_to_txt(infolist, path):
    """Write every collected value to *path*, one value per line, as UTF-8.

    The file is truncated first.
    """
    # 'with' guarantees the handle is closed even if a write fails
    # (the original opened with open() and leaked the handle on error).
    with open(path, 'w', encoding='utf-8') as fw:
        for page in infolist:
            for item in page:
                # '\n' restores one-record-per-line output; the follow-up
                # scripts (time.py / CD.py) read this file line by line, so
                # the original's `+ ''` was almost certainly a scraped-away
                # newline.
                fw.write(item + '\n')

def main():
    """Crawl page_num pages of comments for one video and dump timestamps.

    Each page's "last" cursor feeds the next request; results are printed
    and saved to time.txt (one timestamp per line).
    """
    infolist = []
    vid = '7579013546'   # target video id
    cid = "0"            # pagination cursor; "0" requests the first page
    page_num = 3000      # number of pages to fetch
    # NOTE(review): the base URL was lost when this article was scraped —
    # only '' + vid + '/comment/v2' survives.  Restore the comment-API host
    # prefix before running.
    url = '' + vid + '/comment/v2'
    for _ in range(page_num):
        params = {'orinum': '10', 'cursor': cid}
        html = get_html(url, params)
        cid = parse_page(infolist, html)  # each page yields the next cursor
    print_comment_list(infolist)
    save_to_txt(infolist, 'time.txt')


main()

二.原始数据处理部分 1.华尔街日报的小时得用转换成为正常小时 time.py

# coding=gbk
# time.py — convert the raw Unix timestamps crawled into time.txt into
# human-readable "YYYY-mm-dd HH:MM:SS" rows of data.csv (date and time in
# separate columns, so later scripts can index them independently).
import csv
import time

# 'with' closes both files even if a line fails to parse (the original
# leaked both handles on error).
with open("data.csv", 'w', newline='', encoding='utf-8') as csv_file, \
        open("time.txt", 'r', encoding='utf-8') as src:
    writer = csv.writer(csv_file)
    for line in src:
        stamp = int(line)                 # one Unix timestamp per line
        local = time.localtime(stamp)     # NOTE: local timezone, not UTC
        text = time.strftime("%Y-%m-%d %H:%M:%S", local)
        print(text)
        # split() yields [date, time] -> two CSV columns
        writer.writerow(text.split())

2.华尔街日报具体内容只读csv CD.py

# coding=gbk
# CD.py — re-save the crawled comment text (content.txt) as content.csv,
# one whitespace-split record per row.
import csv

# 'with' closes both files even on error (the original leaked the handles).
with open("content.csv", 'w', newline='', encoding='utf-8') as csv_file, \
        open("content.txt", 'r', encoding='utf-8') as src:
    writer = csv.writer(csv_file)
    for line in src:
        # split() drops the trailing newline and splits on any whitespace
        writer.writerow(line.split())

3.统计资料一天各个小时段内的华尔街日报数 py.py

# coding=gbk
# py.py — count how many comments fall in each hour of the day and save
# the (hour, count) pairs to time2.csv, then read them back as x / y1.
import csv
from collections import Counter

from pyecharts import options as opts    # kept from the original; unused here
from sympy.combinatorics import Subset   # kept from the original; unused here
from wordcloud import WordCloud          # kept from the original; unused here

# Column 1 of data.csv is "HH:MM:SS"; keep the first two characters (hour).
with open('../Spiders/data.csv') as csvfile:
    data1 = [str(row[1])[0:2] for row in csv.reader(csvfile)]
print(data1)
print(type(data1))

# Counter de-duplicates and counts in one O(n) pass — the original called
# list.count() once per distinct value, which is quadratic.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

with open("time2.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(rst)   # one (hour, count) pair per row

with open('time2.csv') as csvfile:
    x = [str(row[0]) for row in csv.reader(csvfile)]
print(x)

with open('time2.csv') as csvfile:
    y1 = [float(row[1]) for row in csv.reader(csvfile)]
print(y1)

4.统计资料除此以外华尔街日报数 py1.py

# coding=gbk
# py1.py — count comments per calendar day and save the (day, count)
# pairs to time1.csv, then read them back as x / y1.
import csv
from collections import Counter

from pyecharts import options as opts    # kept from the original; unused here
from sympy.combinatorics import Subset   # kept from the original; unused here
from wordcloud import WordCloud          # kept from the original; unused here

# Column 0 of data.csv is the date part ("YYYY-mm-dd").
with open('../Spiders/data.csv') as csvfile:
    data1 = [str(row[0]) for row in csv.reader(csvfile)]
print(type(data1))

# Counter replaces the original quadratic set + list.count() loop.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

with open("time1.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(rst)   # one (day, count) pair per row

with open('time1.csv') as csvfile:
    x = [str(row[0]) for row in csv.reader(csvfile)]
print(x)

with open('time1.csv') as csvfile:
    y1 = [float(row[1]) for row in csv.reader(csvfile)]
print(y1)

三. 原始科研人员

原始科研人员不足之处:涉及到了词云绘出,条形,梯形,点心绘出,后三者是对华尔街日报小时与主演占比的数据分析,然而百度的华尔街日报小时是以小时得用的形式揭示,所以要进行转换成,如此一来去统计资料出处意到每一次,最后,新加了对华尔街日报具体内容的人性数据分析。

1.创作词云绘出

wc.py

# wc.py — build a word cloud from the crawled comment text (content.txt),
# shaped by the mask image wc.jpg, and save it to wc.png.
import numpy as np
import re
import jieba
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image

# Source text for the cloud; 'with' replaces the original open/close pair.
with open('content.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Strip latin letters, digits and some punctuation before building the cloud.
newtxt = re.sub("[A-Za-z0-9!\%[],。]", "", txt)
print(newtxt)

# NOTE(review): this segmentation result is never used — generate() below
# tokenises newtxt itself.  Kept to match the original article's code.
words = jieba.lcut(newtxt)

img = Image.open(r'wc.jpg')    # mask image that shapes the cloud
img_array = np.array(img)

wordcloud = WordCloud(
    background_color="white",
    width=1080,
    height=960,
    font_path="../文悦新青年.otf",   # CJK-capable font, required for Chinese
    max_words=150,
    scale=10,                        # render-resolution multiplier
    max_font_size=100,
    mask=img_array,
    collocations=False,              # suppress duplicated bigrams
).generate(newtxt)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('wc.png')

轮廓绘出:wc.jpg

在这里放入绘出片描述

词云绘出:result.png (出处:这里要把字母过滤上来)

2.创作除此以外华尔街日报数条形绘出 DrawBar.py

# encoding: utf-8

import csv

import pyecharts.options as opts

from pyecharts.charts import Bar

from pyecharts.globals import ThemeType

class DrawBar(object):
    """Render the per-day comment counts in time1.csv as a pyecharts bar chart."""

    def __init__(self):
        """Create the Bar instance and set canvas size and theme.

        NOTE(review): the method name was mangled to `短时init短时` by the
        scraper that produced this article; `__init__` is the only sensible
        reconstruction (same for the `__main__` guard below).
        """
        self.bar = Bar(init_opts=opts.InitOpts(
            width='1500px', height='700px', theme=ThemeType.LIGHT))

    def add_x(self):
        """Read column 0 of time1.csv and use it as the X axis."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            x = [str(row[0]) for row in reader]
            print(x)
        self.bar.add_xaxis(xaxis_data=x)

    def add_y(self):
        """Read column 1 of time1.csv and add it as the Y series."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            y1 = [float(row[1]) for row in reader]
            print(y1)
        self.bar.add_yaxis(
            series_name='',   # reconstructed — the scraped line kept only "series_,"
            y_axis=y1,
            label_opts=opts.LabelOpts(is_show=True, color="black"),
            bar_max_width='100px',
        )

    def set_global(self):
        """Configure title, tooltip and toolbox options."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                title='雪里面悍刀行近日华尔街日报统计资料',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,
                trigger="axis",             # guide line follows the pointer
                axis_pointer_type="cross",  # crosshair pointer
            ),
            toolbox_opts=opts.ToolboxOpts(),  # default toolbox (save, zoom, ...)
        )

    def draw(self):
        """Assemble the chart and render it to an HTML file."""
        self.add_x()
        self.add_y()
        self.set_global()
        self.bar.render('../Html/DrawBar.html')

    def run(self):
        """Entry point: build and render the chart."""
        self.draw()


if __name__ == '__main__':
    app = DrawBar()
    app.run()

效果绘出:DrawBar.html

3.创作近日华尔街日报数点心绘出 pie_pyecharts.py

# pie_pyecharts.py — draw the per-day comment counts from time1.csv as
# three pie-style charts (plain pie, ring, rose) on one canvas.
import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint               # kept from the original; unused here
from pyecharts.globals import ThemeType

with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    x = [str(row[0]) for row in reader]   # labels (column 0)
print(x)

with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [float(row[1]) for row in reader]  # values (column 1)
print(y1)

num = y1
lab = x

# NOTE(review): the opening "(" of this fluent call chain was lost in
# scraping; restored so the chain parses.
(
    Pie(init_opts=opts.InitOpts(width='1700px', height='450px',
                                theme=ThemeType.LIGHT))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="雪里面悍刀行近日华尔街日报统计资料",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27)),
        legend_opts=opts.LegendOpts(
            pos_top="10%", pos_left="1%",  # nudge the legend inward
        ),
    )
    # plain pie
    .add(series_name='', center=[280, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # ring
    .add(series_name='', center=[845, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose
    .add(series_name='', center=[1380, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
).render('pie_pyecharts.html')

效果绘出

4.创作观看小时列车运行华尔街日报统计资料点心绘出 pie_pyecharts3.py

# coding=gbk
# pie_pyecharts3.py — group comment counts by period of the day (night /
# morning / afternoon / evening) and draw three pie-style charts.
import csv
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from sympy.combinatorics import Subset   # kept from the original; unused here
from wordcloud import WordCloud          # kept from the original; unused here
from pyecharts.charts import Pie
from random import randint               # kept from the original; unused here

# NOTE(review): the path literal lost its opening quote in scraping
# ("open(/data.csv')"); '../Spiders/data.csv' matches the sibling scripts.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    # column 1 is "HH:MM:SS"; keep the hour as an int
    # (the original's .strip('') was a no-op and is dropped)
    data2 = [int(row[1][0:2]) for row in reader]
print(type(data2))

# De-duplicate and count each hour.  Renamed from `list`, which shadowed
# the builtin in the original.
hour_counts = sorted((h, data2.count(h)) for h in set(data2))
print(type(hour_counts))

with open("time2.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    for pair in hour_counts:   # one (hour, count) pair per row
        writer.writerow(pair)

# split the distinct hours into n equal day-period groups
n = 4
m = int(len(hour_counts) / n)
list2 = []
for i in range(0, len(hour_counts), m):
    list2.append(hour_counts[i:i + m])
print("凌晨 : ", list2[0])
print("上午 : ", list2[1])
print("下午 : ", list2[2])
print("晚上 : ", list2[3])

with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [int(row[1]) for row in reader]
print(y1)

n = 6
groups = [y1[i:i + n] for i in range(0, len(y1), n)]
print(groups)

x = ['凌晨', '上午', '下午', '晚上']
# NOTE(review): the original reuses y1 and groups as loop variables here,
# and the per-group sums (num_sum) are computed but never collected — so
# the charts below plot the *last* group's raw counts, not the four period
# totals.  Preserved as-is; almost certainly a bug in the source article.
y1 = []
for y1 in groups:
    num_sum = 0
    for groups in y1:
        num_sum += groups

str_name1 = '点'
num = y1
lab = x

# Opening "(" of the fluent chain restored (lost in scraping).
(
    Pie(init_opts=opts.InitOpts(width='1500px', height='450px',
                                theme=ThemeType.LIGHT))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="雪里面悍刀行观看小时列车运行华尔街日报统计资料",
            title_textstyle_opts=opts.TextStyleOpts(font_size=30)),
        legend_opts=opts.LegendOpts(
            pos_top="8%",   # nudge the legend down
        ),
    )
    # plain pie
    .add(series_name='', center=[260, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # ring
    .add(series_name='', center=[1230, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose
    .add(series_name='', center=[750, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
).render('pie_pyecharts3.html')

效果绘出

04

总结

1. 本文参考概述了如何爬取百度预告片华尔街日报并进行可视化数据分析,读者可以自行动手尝试。

2. 本文十分较难桃子进行练手。

3. 本文供努力学习参考,不做它用。

大连白癜风哪里治疗好
武汉哪里能治疗白癜风
成都哪家医院看白癜风

上一篇: LHF100/166-2橡胶气囊多种多样负载的北坡

下一篇: 早新闻:微软4361亿收购动视暴雪;个位打通IP产业链

友情链接