词云图制作
词云图制作
一、什么是词云
这就是词云。
“词云”的概念最早是美国西北大学新闻学副教授、新媒体专业主任里奇•戈登( Rich Gordon )提出的。词云( Word Cloud ),又称文字云、标签云( Tag Cloud )、关键词云( Keyword Cloud ),是文本数据的一种可视化展现方式,它一般是由文本数据中提取的词汇组成某些彩色图形。词云图的核心价值在于以高频关键词的可视化表达来传达大量文本数据背后的有价值的信息。
二、基础技术
此处是以《滕王阁序》为文本进行的实验
1、词根拆解
import jieba.posseg as pseg# 需要制作图云的文本位置
text_file = open("滕王阁序.txt", encoding="utf-8").read()
# 词根拆解对象,对象格式为 词、词性
words = pseg.cut(text_file, '')
jieba可以按照文本的词缀将全文本按照词性进行拆分
将words输出查看
for word, flag in words:print(f'word: {word}, flag: {flag}')
格式如下
word: 豫章, flag: ns
word: 故, flag: n
word: 郡, flag: n
flag的部分值如下
flag | 含义 |
---|---|
n | 名词 |
x | 标点符号 |
v | 动词 |
ns | 地名 |
2、获取高频词
将拆分的词根放到一个list,按照出现的次数排列,获取前两个
from collections import Counter
report_words = []
for word, flag in words:report_words.append(word)
result = Counter(report_words).most_common(2)
结果如下
[('帝子', 2), ('豫章', 1)]
3、设置停用词
文本中可能会出现大量无用词汇(如:你、我、他),需要将这些词汇排除在外,避免对结果造成干扰。
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
stopwords.update(["你, "我", "他"])
wordcloud = WordCloud(.....stopwords=stopwords, # 停用词......)
4、设置图云形状
有时为了展示的形状规格化,会找个图片让词云按照制定图片的轮廓进行展示
import PIL.Image as Image
import numpy as np
from wordcloud import WordCloud
# egg.png 为图片的全路径
background = Image.open("egg.png").convert('RGB')
mask = np.array(background)
wordcloud = WordCloud(.....mask =mask, # 掩膜......)
(此处暂未明晰为什么要转为np数组,翻看源码的时候发现默认值是这样的)、
# recompute integral image
if self.mask is None:img_array = np.asarray(img_grey)
else:img_array = np.asarray(img_grey) + boolean_mask
5、设置字体和图云颜色
from wordcloud import WordCloud
# 字体样式路径(需要自己去下载)
font_path = r"./STLITI.TTF"
# 设置字体大小
max_font_size = 200
min_font_size = 10# 建立颜色数组,可更改颜色
color_list = ['#FF274B', '#37A2DA', '#FD666D', '#67E0E3']
# 调用颜色数组
colormap = colors.ListedColormap(color_list)
wordcloud = WordCloud(.....font_path=font_path, # 字体路径colormap=colormap, # 字体颜色max_font_size=max_font_size, # 最大字体大小min_font_size=min_font_size, # 最小字体大小......)
6、WordCloud的参数
参数 | 说明 |
---|---|
scale | 输出清晰度 |
font_path | 自定义字体所在路径 |
colormap | 字体颜色 |
width | 输出图片宽度 |
height | 输出图片高度 |
background_color | 图片背景颜色 |
stopwords | 停用词 |
mask | 掩膜 |
max_font_size | 最大字体大小 |
min_font_size | 最小字体大小 |
三、完整案例演示
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
import jieba.posseg as pseg
from collections import Counter
import PIL.Image as Image
from matplotlib import colorsfile_path = r'E:\MYCODE\PYECHARTS\滕王阁序.txt'stop_words = ["的", "我"]
image_path = r"E:\MYCODE\PYECHARTS\egg.png"
color_list = ['#FF274B', '#37A2DA', '#FD666D', '#67E0E3']
target_path = r"E:\MYCODE\PYECHARTS\wordcloud.png"def split_word(file_path, word_length=None, word_flag=None, frequencylimit=None):"""将文本中的内容拆分成词根:param file_path: 文件路径:param word_length: 拆分结果中词的长度(长度小于参数值的词将被舍弃),不做设置赋值为 None:param word_flag: 词性 (n:名称、v:动词、x标点符号),不做设置赋值为 None:param frequencylimit: 频率限制,词的出现个数超过频率限制才返回。不做设置赋值为 None:return:"""# 需要制作图云的文本位置text_file = open(file_path, encoding="utf-8").read()# 词根拆解对象,对象格式为 词、词性words = pseg.cut(text_file, '')# 按指定长度和词性提取词report_words = []for word, flag in words:useword = wordif word_length is not None:useword = Noneif len(word) > word_length:useword = wordif word_flag is not None:useword = Noneif word_flag in flag:useword = wordif useword is not None:report_words.append(useword)# 统计高频词汇if frequencylimit is not None:result = Counter(report_words).most_common(frequencylimit)else:result = report_words# 将词汇统计结果转化为字典word_dict = dict(result)return word_dictdef get_word_cloud(word_dict, stop_words, image_path, color_list, target_path, font_path=None, other_field=None, show=False):"""获取词云图:param word_dict: 词云字典:param stop_words: 停用词列表:param image_path: 参照图片位置(用于设置图云形状):param font_path: 字体路径:param color_list: 颜色列表:param target_path: 目标图片路径:param other_field: 参考 default_field:param show: 是否展示:return:"""default_field = {'scale': 4, 'width': 1600, 'height': 900, 'background_color': 'white', 'max_font_size': 200,'min_font_size': 10}if other_field is not None:for field, value in other_field.items():if value is not None:default_field[field] = value# 设置停用词stopwords = set(STOPWORDS)stopwords.update(stop_words)# 设置png掩膜(需要设置的图云形状图片路径)background = Image.open(image_path).convert('RGB')mask = np.array(background)# 设置字体样式路径if font_path is None:font_path = r"./STLITI.TTF"# 建立颜色数组,可更改颜色if color_list is None:color_list = ['#FF274B', '#37A2DA', '#FD666D', '#67E0E3']# 调用颜色数组colormap = colors.ListedColormap(color_list)# 生成词云wordcloud = WordCloud(scale=default_field['scale'],font_path=font_path,colormap=colormap,width=default_field['width'],height=default_field['height'],background_color=default_field['background_color'],stopwords=stopwords, # 停用词mask=mask, # 掩膜max_font_size=default_field['max_font_size'],min_font_size=default_field['min_font_size'])wordcloud.generate_from_frequencies(word_dict)if show:# 使用 matplotlib 显示词云plt.imshow(wordcloud, interpolation='bilinear')plt.axis('off')plt.show()# 保存词云图wordcloud.to_file(target_path)if __name__ == '__main__':# 词根拆解word_dict = split_word(file_path=file_path, word_length=2, word_flag='n', frequencylimit=2000)print(word_dict)# 生成词云图get_word_cloud(word_dict=word_dict, stop_words=stop_words, image_path=image_path, color_list=color_list, target_path=target_path, show=True)
运行结果为第一章节的图片