Key techniques covered in this chapter:

  • Read a txt file and split its contents into n equal parts stored in a DataFrame
  • Word-frequency statistics & word-cloud plotting
  • Assuming the text is split into 20 ordered segments, visualize the emotion trend across segments
  • Assuming the text is split into 20 ordered segments, visualize how the similarity between adjacent segments changes



1. Reading in the Novel

Split the novel into 20 equal parts; by default, each part represents one period of the story.

import pandas as pd


def read_txt(file, num_segments, encoding='utf-8'):
    # Read the txt file
    with open(file, "r", encoding=encoding) as f:
        text = f.read()
    
    # Total length of the text and the length of each segment
    total_length = len(text)
    segment_length = total_length // num_segments
    
    # Split the text into the requested number of segments
    segments = []
    for i in range(num_segments):
        start = i * segment_length
        end = (i + 1) * segment_length
        if i == num_segments - 1:
            end = total_length
        segment = text[start:end]
        segments.append(segment)

    # Store the segments in a DataFrame
    df = pd.DataFrame(segments, columns=["doc"])
    
    return df


# Split into 20 segments
df = read_txt(file='data/三体.txt', num_segments=20)
df
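
A quick sanity check on the split (the exact character lengths depend on the file):

print(df.shape)                     # expected: (20, 1)
print(df.doc.str.len().describe())  # character length per segment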


2. Cleaning the Data

  1. Remove non-Chinese characters
  2. Remove stopwords

import jieba
import re
import cntext as ct
# cntext version 1.8.4
# to install: pip3 install cntext==1.8.4

stopwords = ct.load_pkl_dict('STOPWORDS.pkl')['STOPWORDS']['chinese']

def clean(doc):
    # Remove non-Chinese characters
    doc = re.sub('[^\u4e00-\u9fa5]', '', doc)
    words = jieba.lcut(doc)
    words = [w for w in words if w not in stopwords]
    return words


clean(doc='质子带有的电荷也会转移到展开后形成的黑洞中,我们就能用电磁力捕捉和控制住它。')

['质子', '带有', '电荷', '转移', '展开', '形成', '黑洞', '电磁力', '捕捉', '控制', '住']

df['words'] = df.doc.apply(clean)
df.head()


3. Visualization

3.1 Word-Count Trend

The novel was split into 20 parts of equal character length. Because the content differs from part to part, lexical density varies as well, i.e., the average number of tokens per unit of text length changes across the parts.

from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK

word_nums = df.words.apply(len)

line = Line()

line.add_xaxis(xaxis_data=[str(i) for i in range(20)])

line.add_yaxis("词语数变化",
               word_nums.tolist(),
               label_opts=opts.LabelOpts(is_show=False))

line.set_global_opts(title_opts=opts.TitleOpts(title="三体词数变化(单位字符长度内)"))

line.load_javascript()
line.render('三体词数变化(单位字符长度内).html')
line.render_notebook()


3.2 Word Cloud

import jieba
from pyecharts import options as opts
from pyecharts.charts import WordCloud
import re

text = ''.join(df.doc)
text = re.sub('[^\u4e00-\u9fa5]+', '', text)
words = jieba.lcut(text)
words = [w for w in words if len(w)>1]

from collections import Counter

# Count each token's frequency, sorted from most to least frequent
records = Counter(words).most_common()
wordcounts = [(w, str(f)) for w, f in records]

wc = WordCloud()
wc.add(series_name="", data_pair=wordcounts, word_size_range=[20, 100])
wc.set_global_opts(
        title_opts=opts.TitleOpts(title="三体词云图", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    
    
wc.render("三体词云图.html")  # output file path
wc.render_notebook()
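
To sanity-check what the cloud will emphasize, print the most frequent tokens first:

print(records[:10])  # ten most frequent (word, count) pairs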


3.3 Emotion Visualization

Using the DUTIR fine-grained emotion lexicon, plot how its seven emotion categories (乐 joy, 好 liking, 怒 anger, 哀 sorrow, 惧 fear, 恶 disgust, 惊 surprise) change over the course of the novel.


def emotion_count(text):
    res = ct.sentiment(text=text, 
             diction=ct.load_pkl_dict('DUTIR.pkl')['DUTIR'], 
             lang='chinese')
    return pd.Series(res)


text = '被光速物体摧毁,但认为光粒可能是银河系中的一种自然现象。虽然迄今为止没有观察到第二个光粒现'
emotion_count(text=text)


乐_num            0
好_num            0
怒_num            0
哀_num            1
惧_num            0
恶_num            0
惊_num            0
stopword_num     9
word_num        23
sentence_num     2
dtype: int64

emotion_df = df.doc.apply(emotion_count).T
emotion_df
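
Because each segment has the same character length, the raw counts are roughly comparable already; if you prefer proportions instead, here is a sketch that normalizes each emotion count by the segment's word_num (column names as in the output above):

emotion_cols = ['乐_num', '好_num', '怒_num', '哀_num', '惧_num', '恶_num', '惊_num']
raw = df.doc.apply(emotion_count)                            # segments as rows
ratio_df = raw[emotion_cols].div(raw['word_num'], axis=0).T  # emotions as rows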


from pyecharts import options as opts
from pyecharts.charts import Line

line_chart = Line()

line_chart.add_xaxis(xaxis_data=[str(c) for c in emotion_df.columns])

for linename in emotion_df.index:
    linedata = emotion_df.loc[linename].tolist()
    line_chart.add_yaxis(series_name = linename, 
                         y_axis=linedata,
                         label_opts=opts.LabelOpts(formatter="{b}", 
                                                   position="right", 
                                                   is_show=False))

line_chart.set_global_opts(
        title_opts=opts.TitleOpts(title='三体情绪变化',
                                  pos_top="5%",
                                  pos_right='30%'),
        xaxis_opts=opts.AxisOpts(name="Segment"),
        yaxis_opts=opts.AxisOpts(name="Count"),
        legend_opts=opts.LegendOpts(pos_right='0%', orient='vertical'),

    )

line_chart.render_notebook()
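
As in the other sections, the chart can also be saved to a standalone HTML file:

line_chart.render('三体情绪变化.html')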


3.4 Similarity Visualization

The degree of plot change between adjacent segments is measured with cosine similarity.
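
For intuition, cosine similarity compares bag-of-words vectors of the two texts. A minimal illustration of the idea (not necessarily identical to how ct.cosine_sim is implemented internally):

import math
from collections import Counter
import jieba

def cosine_sim_demo(text1, text2):
    # Build word-frequency vectors and compute their cosine
    v1, v2 = Counter(jieba.lcut(text1)), Counter(jieba.lcut(text2))
    dot = sum(v1[w] * v2[w] for w in set(v1) & set(v2))
    norm1 = math.sqrt(sum(c * c for c in v1.values()))
    norm2 = math.sqrt(sum(c * c for c in v2.values()))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0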

from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
import cntext as ct

sims = []
for idx in range(len(df)):
    if idx == 0:
        # The first segment has no predecessor; append an arbitrary
        # placeholder so the series still has 20 points.
        sims.append(0.4)
        continue
    pre_text = df.loc[idx-1, 'doc']
    after_text = df.loc[idx, 'doc']
    sim = ct.cosine_sim(text1=pre_text, text2=after_text)
    sims.append(sim)


line = Line()
line.add_xaxis(xaxis_data=[str(i) for i in range(20)])
line.add_yaxis("相似度", 
               sims, 
               label_opts=opts.LabelOpts(is_show=False))

line.set_global_opts(title_opts=opts.TitleOpts(title="三体小说剧情相似度变化趋势"))

line.load_javascript()
line.render('三体小说剧情相似度变化趋势.html')
line.render_notebook()

With this similarity series in hand, you can use the ruptures library to detect change points in the sequence (roughly where the plot shifts, i.e., approximate chapter boundaries); see "Using Ruptures to Detect Change Points in Time Series Data".
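
A minimal change-point sketch with ruptures (assumed installed via pip3 install ruptures; n_bkps=3 is an illustrative choice, not a tuned value):

import numpy as np
import ruptures as rpt

signal = np.array(sims).reshape(-1, 1)
algo = rpt.Binseg(model="l2").fit(signal)
breakpoints = algo.predict(n_bkps=3)  # segment indices where the series shifts
print(breakpoints)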

