您现在的位置是：首页 > Python > 正文

Python

周杰伦超话微博数据热点分析

张文迪2019-07-24Python13801

周杰伦超话最近刷屏了，发现科赛网有超话的6000条数据，正好拿来分析一下。我们先安装 echarts 的 Python 包 pyecharts（!pip install pyecharts），这里默认安装的是 1.xx 版本。

周杰伦超话最近刷屏了，发现科赛网有超话的6000条数据，正好拿来分析一下

# 安装包
!pip install pyecharts

我们先安装echarts的python包，这里默认安装的是1.xx版本

# 加载包
import pandas as pd
import numpy as np
import jieba
#导入词云的包
from wordcloud import WordCloud
import matplotlib.pyplot as plt

然后我们加载数据

# Read the scraped Weibo super-topic export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/data7857/weibo.csv')
# Show the first five rows as a quick look at the columns.
df.head(n=5)

我们可以看到如下结果：

	rid	用户名称	微博等级	微博内容	微博转发量	微博评论量	微博点赞	发布时间
0	1	道門網-正统道教网站	蓝v	周杰伦超话#周杰伦超话# 为了杰伦，豁出去了，现场开光，道友们速速打榜道門網-正统道教网站...	1.7万	3323	4.3万	14小时前
1	2	科学未来人	金v	#周杰伦超话第一# 对抗虚假流量，大概只能靠周董。\n\n2000年他出道的时候，我初二。对...	711	1181	1万	昨天 08:55
2	3	会火	金v	#周杰伦超话# 自周杰伦被人质疑不做数据，数据不好后，周杰伦粉丝自发开始学习超话规则，一步步...	674	3485	12.3万	20-Jul
3	4	换个名字比见权鸡涌还难	普通用户	@周杰伦超话要自己争气啊#周杰伦超话#	转发	评论	赞	刚刚
4	5	洗手间歌唱家	微博达人	#周杰伦粉丝被迫营业##周杰伦超话#\n成都的雨下的好黑人哦\n比杰伦心里下得雨好大。“心里...	转发	评论	赞	刚刚

下面我们修改一下df的列名：

# Leave the raw frame untouched; the copy gets ASCII column names so that
# later cells can reach columns by attribute (e.g. df3.weibo_level).
df2 = df.copy(deep=True)
df2.columns = [
    'id',
    'user_name',
    'weibo_level',
    'weibo_content',
    'forward',
    'comments',
    'thumbs',
    'time',
]
df2.head(n=5)

Out:

	id	user_name	weibo_level	weibo_content	forward	comments	thumbs	time
0	1	道門網-正统道教网站	蓝v	周杰伦超话#周杰伦超话# 为了杰伦，豁出去了，现场开光，道友们速速打榜道門網-正统道教网站...	1.7万	3323	4.3万	14小时前
1	2	科学未来人	金v	#周杰伦超话第一# 对抗虚假流量，大概只能靠周董。\n\n2000年他出道的时候，我初二。对...	711	1181	1万	昨天 08:55
2	3	会火	金v	#周杰伦超话# 自周杰伦被人质疑不做数据，数据不好后，周杰伦粉丝自发开始学习超话规则，一步步...	674	3485	12.3万	20-Jul
3	4	换个名字比见权鸡涌还难	普通用户	@周杰伦超话要自己争气啊#周杰伦超话#	转发	评论	赞	刚刚
4	5	洗手间歌唱家	微博达人	#周杰伦粉丝被迫营业##周杰伦超话#\n成都的雨下的好黑人哦\n比杰伦心里下得雨好大。“心里...	转发	评论	赞	刚刚

查看数据情况：

# Inspect the frame's shape as a (rows, columns) tuple.
df2.shape

输出：

(6004, 8)

查看基本信息：

# Print a summary of the frame: index range, per-column non-null counts
# and dtypes, and memory usage.
df2.info()

Out:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6004 entries, 0 to 6003
Data columns (total 8 columns):
id               6004 non-null int64
user_name        6004 non-null object
weibo_level      6004 non-null object
weibo_content    6004 non-null object
forward          6004 non-null object
comments         6004 non-null object
thumbs           6004 non-null object
time             6004 non-null object
dtypes: int64(1), object(7)
memory usage: 375.3+ KB

# Drop rows that repeat the same (user_name, weibo_content) pair,
# keeping the first occurrence of each.
df3 = df2.drop_duplicates(subset=['user_name', 'weibo_content'], keep='first')
df3.shape

Out:

(774, 8)

去重后只剩 774 条，可见数据中有大量重复的微博内容，说明粉丝力争的心态很强呀！

# Number of de-duplicated posts per user level. value_counts() orders the
# levels by frequency, so the chart's x-axis labels are taken from this
# index instead of being hard-coded (which mislabelled the bars).
user_type = df3.weibo_level.value_counts()


def _parse_weibo_count(raw):
    """Convert one scraped counter cell to an int.

    Cells hold a plain number ('3323'), a number with the 万 (x10,000)
    suffix ('1.7万'), or just the button label ('评论', '转发', '赞') when
    the post shows no count. Anything that is not numeric yields 0.
    """
    text = str(raw).strip()
    scale = 1
    if text.endswith('万'):
        scale = 10000
        text = text[:-1]
    try:
        # round() absorbs float noise such as 1.7 * 10000 == 17000.000000000002
        return int(round(float(text) * scale))
    except (ValueError, OverflowError):
        return 0


# Total comments received per user level. Computed on the de-duplicated
# posts so a post that was scraped several times is only counted once.
df_comments = df3.copy()
df_comments['comments'] = df_comments['comments'].map(_parse_weibo_count)
user_comments = (
    df_comments.groupby('weibo_level')['comments']
    .sum()
    .reindex(user_type.index, fill_value=0)  # same level order as user_type
)

# pyecharts charting imports
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from pyecharts.charts import Bar

bar = (
    Bar(opts.InitOpts(width='1000px', height='500px', theme=ThemeType.INFOGRAPHIC,
                      js_host="https://cdn.kesci.com/lib/pyecharts_assets/"))  # new bar chart
    .add_xaxis(user_type.index.tolist())
    # .tolist() hands pyecharts native Python ints rather than numpy scalars
    .add_yaxis("用户数", user_type.tolist(), is_selected=True)
    .add_yaxis("评论数", user_comments.tolist(), is_selected=True)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="不同类型用户数及评论情况"),  # chart title
        xaxis_opts=opts.AxisOpts(
            splitline_opts=opts.SplitLineOpts(is_show=True)
        ),  # x axis
        yaxis_opts=opts.AxisOpts(
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            )
        ),  # y axis
        toolbox_opts=opts.ToolboxOpts(is_show=True),  # toolbox
    )
)
bar.render_notebook()

Out:

周杰伦超话微博数据热点分析

让我们再看看点赞转发的情况：

def _parse_weibo_count(raw):
    """Convert one scraped counter cell to an int.

    Cells hold a plain number ('3323'), a number with the 万 (x10,000)
    suffix ('1.7万'), or just the button label ('评论', '转发', '赞') when
    the post shows no count. Anything that is not numeric yields 0.
    """
    text = str(raw).strip()
    scale = 1
    if text.endswith('万'):
        scale = 10000
        text = text[:-1]
    try:
        # round() absorbs float noise such as 1.7 * 10000 == 17000.000000000002
        return int(round(float(text) * scale))
    except (ValueError, OverflowError):
        return 0


# User levels ordered by number of de-duplicated posts; used as x-axis
# labels and to align both series.
level_order = df3.weibo_level.value_counts().index

# Likes per post as integers, on the de-duplicated posts so a post that
# was scraped several times is only counted once.
df_thumbs = df3.copy()
df_thumbs['thumbs'] = df_thumbs['thumbs'].map(_parse_weibo_count)
# Forwards per post as integers, same de-duplicated basis.
df_forward = df3.copy()
df_forward['forward'] = df_forward['forward'].map(_parse_weibo_count)

# Total likes received per user level.
user_thumbs = (
    df_thumbs.groupby('weibo_level')['thumbs']
    .sum()
    .reindex(level_order, fill_value=0)
)
# Total forwards received per user level.
user_forward = (
    df_forward.groupby('weibo_level')['forward']
    .sum()
    .reindex(level_order, fill_value=0)
)

bar1 = (
    Bar(opts.InitOpts(width='1000px', height='500px', theme=ThemeType.INFOGRAPHIC,
                      js_host="https://cdn.kesci.com/lib/pyecharts_assets/"))  # new bar chart
    .add_xaxis(level_order.tolist())
    # Each legend entry is paired with its own series ("转发数" = forwards,
    # "点赞数" = likes); the two were previously swapped.
    .add_yaxis("转发数", user_forward.tolist(), is_selected=True)
    .add_yaxis("点赞数", user_thumbs.tolist(), is_selected=True)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="不同类型用户点赞及转发情况"),  # chart title
        xaxis_opts=opts.AxisOpts(
            splitline_opts=opts.SplitLineOpts(is_show=True)
        ),  # x axis
        yaxis_opts=opts.AxisOpts(
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            )
        ),  # y axis
        toolbox_opts=opts.ToolboxOpts(is_show=True),  # toolbox
    )
)
bar1.render_notebook()

Out:

周杰伦超话微博数据热点分析

我们可以发现，普通用户的传播能力很强：普通用户数量相对较多，在长尾效应的作用下，整体的传播能力反而更强。

下面我们接着进行文本分析：

# Write the de-duplicated post texts to a tab-separated text file
# (no index, no header) that the segmentation step reads back.
df_content = df3.loc[:, ['weibo_content']]
df_content.to_csv(path_or_buf='../work/input.txt', sep='\t', index=False, header=None)

from string import digits

# Segment every line of the exported posts with jieba and write the
# comma-joined tokens to a scratch file.
jieba.load_userdict('../work/dict.txt')
filename = '../work/output.txt'
fileneedCut = '../work/input.txt'
# Context managers close both files even if segmentation raises.
with open(fileneedCut, 'r', encoding='utf-8') as fn, \
        open(filename, 'w', encoding='utf-8') as fm:
    for line in fn:
        # cut_all=True selects jieba's full mode.
        # One write per line; tokens of consecutive lines are written
        # back to back with no extra separator, as before.
        fm.write(",".join(jieba.cut(line, cut_all=True)))

# Remove unwanted words and all ASCII digits before building the cloud.
with open(filename, 'r', encoding='utf-8') as fpr:
    content = fpr.read()
# NOTE(review): presumably these are dropped because they are the topic's own
# hashtag words and would dominate the cloud - confirm. Order is preserved
# from the original sequence of replace() calls.
for unwanted in ('周杰伦', '哈哈', '超话', '被迫', '营业', '第一', '粉丝'):
    content = content.replace(unwanted, '')
remove_digits = str.maketrans('', '', digits)
content = content.translate(remove_digits)
with open('../work/result.txt', 'w', encoding='utf-8') as fpw:
    fpw.write(content)

# Render the cleaned token text as a word cloud.
# Read through a context manager so the file handle is closed right away.
with open('../work/result.txt', 'r', encoding='utf-8') as result_file:
    fk = result_file.read()
# Font file used to draw the words.
# NOTE(review): presumably a CJK-capable font is required for the Chinese
# text to render - confirm.
font = r'../work/simfang.ttf'
wordcloud = WordCloud(
    # mask=coloring,  # optional shape mask; `coloring` is not defined in the visible code
    background_color="white",  # white background (the library default is black)
    width=1000,                # image width
    height=1000,               # image height
    margin=10,                 # image margin
    font_path=font,            # font used for the words
    relative_scaling=0.5,
).generate(fk)
# Draw the image
plt.imshow(wordcloud)
# Hide the axes
plt.axis("off")
# Show the figure
plt.show()