## 一、读入小说
把自传数据等分成20份，默认每一份代表一个时期。

In [1]:
import pandas as pd


def read_txt(file, num_segments, encoding='utf-8'):
    # 读取txt文件
    with open(file, "r", encoding=encoding) as f:
        text = f.read()
    
    # 获取文本的总长度和每一段的长度
    total_length = len(text)
    segment_length = total_length // num_segments
    
    # 将文本分割成指定数量的段落
    segments = []
    for i in range(num_segments):
        start = i * segment_length
        end = (i + 1) * segment_length
        if i == num_segments - 1:
            end = total_length
        segment = text[start:end]
        segments.append(segment)

    # 将内容存储在数据框中
    df = pd.DataFrame(segments, columns=["doc"])
    
    return df


#分成20份
df = read_txt(file='data/三体.txt', num_segments=20)
df

FileNotFoundError: [Errno 2] No such file or directory: 'data/三体.txt'

<br>

## 二、清洗数据
1. 剔除非中文字符
2. 去除停用词

In [7]:
import jieba
import re
import cntext as ct
#1.8.4版本cntext
#如需安装 pip3 install cntext==1.8.4

stopwords = ct.load_pkl_dict('STOPWORDS.pkl')['STOPWORDS']['chinese']

def clean(doc):
    #剔除非中文字符
    doc = re.sub('[^\u4e00-\u9fa5]', '', doc)
    words = jieba.lcut(doc)
    words = [w for w in words if w not in stopwords]
    return words


clean(doc='质子带有的电荷也会转移到展开后形成的黑洞中，我们就能用电磁力捕捉和控制住它。')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/y0/4gqxky0s2t94x1c1qhlwr6100000gn/T/jieba.cache
Loading model cost 0.428 seconds.
Prefix dict has been built successfully.


['质子', '带有', '电荷', '转移', '展开', '形成', '黑洞', '电磁力', '捕捉', '控制', '住']

In [8]:
df['words'] = df.doc.apply(clean)
df.head()

Unnamed: 0,doc,words
0,\n------------\n\n第1章 科学边界(1)\n\n 恋上你看书网 63...,"[第章, 科学, 边界, 恋上你, 看书, 网, 最快, 更新, 三体, 全集, 最新, 章..."
1,后，见做饭的人拎着擀面杖出去，到屋前的那条小河中‘乒乓’几棒子，就打上几条大鱼来……多富饶的...,"[后见, 做饭, 拎, 擀面杖, 出去, 屋前, 那条, 小河, 乒乓, 棒子, 几条, 大..."
2,了第三个球体，情况发生了令我震惊的变化。前面说过，任何图形在我的意识深处都是数字化的，前面的...,"[第三个, 球体, 情况, 发生, 令, 震惊, 变化, 前面, 图形, 意识, 深处, 数..."
3,问者：基本了解，不清楚的我会提问。\n\n 叶文洁：好的。在接收到外星信息并回信后的当...,"[问, 基本, 了解, 清楚, 我会, 提问, 叶文洁, 接收, 外星, 信息, 回信, 当..."
4,的出现，质子带有的电荷也会转移到展开后形成的黑洞中，我们就能用电磁力捕捉和控制住它。”\n\...,"[出现, 质子, 带有, 电荷, 转移, 展开, 形成, 黑洞, 电磁力, 捕捉, 控制, ..."


<br>

## 三、可视化
### 3.1 词数变化
小说分成了20部分，每部分字符长度是相同的。但是因为章节内容不同，用词的复杂度会有变化，即单位长度内词语平均数据量。

In [17]:
from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK

word_nums = df.words.apply(lambda words: len(words))
    
line = Line()

line.add_xaxis(xaxis_data=list(range(20)))

line.add_yaxis("词语数变化", 
               word_nums, 
               label_opts=opts.LabelOpts(is_show=False))

line.set_global_opts(title_opts=opts.TitleOpts(title="三体词数变化(单位字符长度内)"))

line.load_javascript()
line.render('三体词数变化(单位字符长度内).html')
line.render_notebook()

### 3.2 词云图


In [59]:
import jieba
from pyecharts import options as opts
from pyecharts.charts import WordCloud
import cntext as ct

stopwords = ct.load_pkl_dict('STOPWORDS.pkl')['STOPWORDS']['chinese']

text = ''.join(df.doc)
text = re.sub('[^\u4e00-\u9fa5]+', '', text)
words = jieba.lcut(text)
words = [w for w in words if len(w)>1 and w not in stopwords]

records = []
for w in set(words):
    freq = words.count(w)
    records.append((w, freq))
    

records = sorted(records, key=lambda k:k[1], reverse=True)
wordcounts = [(w, str(f)) for w,f in records]

wc = WordCloud()
wc.add(series_name="", data_pair=wordcounts, word_size_range=[20, 100])
wc.set_global_opts(
        title_opts=opts.TitleOpts(title="三体词云图", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    
    
wc.render("三体词云图.html")  #存储位置
wc.render_notebook()

### 3.3  情绪可视化
使用DUTIR细粒度情感词典，绘制7种情绪在三体小说中的变化趋势

In [27]:

def emotion_count(text):
    res = ct.sentiment(text=text, 
             diction=ct.load_pkl_dict('DUTIR.pkl')['DUTIR'], 
             lang='chinese')
    return pd.Series(res)


text = '被光速物体摧毁，但认为光粒可能是银河系中的一种自然现象。虽然迄今为止没有观察到第二个光粒现'
emotion_count(text=text)

乐_num            0
好_num            0
怒_num            0
哀_num            1
惧_num            0
恶_num            0
惊_num            0
stopword_num     9
word_num        23
sentence_num     2
dtype: int64

In [38]:
emotion_df = df.doc.apply(emotion_count).T
emotion_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
乐_num,187,147,178,176,170,222,227,170,196,189,144,179,206,164,138,240,165,143,183,172
好_num,508,563,669,660,550,649,663,512,670,575,526,508,614,629,462,602,515,518,451,429
怒_num,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
哀_num,81,70,92,106,82,92,91,101,127,80,120,120,129,122,90,147,107,98,83,87
惧_num,72,67,73,52,108,36,48,85,55,54,74,65,73,93,84,61,65,50,61,43
恶_num,382,391,349,349,376,378,335,331,390,296,418,368,393,372,306,318,269,315,269,213
惊_num,28,34,27,29,22,21,26,15,29,27,16,33,24,25,26,31,37,24,25,25
stopword_num,13482,13448,13339,13624,13823,14436,14061,13486,13553,13183,13023,13657,13454,12872,13011,13419,12874,12814,13145,13077
word_num,31960,31748,31496,31327,32009,32315,32761,31885,31745,31721,31415,31639,31278,30533,30735,31602,31386,30778,31306,31640
sentence_num,1353,1208,1241,1191,1176,1169,1289,1126,1142,1168,1051,1154,1161,1051,1075,1176,1183,1120,1166,1292


In [60]:
from pyecharts import options as opts
from pyecharts.charts import Line

line_chart = Line()

line_chart.add_xaxis(xaxis_data=emotion_df.columns.tolist())

for linename in emotion_df.index:
    linedata = emotion_df.loc[linename].tolist()
    line_chart.add_yaxis(series_name = linename, 
                         y_axis=linedata,
                         label_opts=opts.LabelOpts(formatter="{b}", 
                                                   position="right", 
                                                   is_show=False))

line_chart.set_global_opts(
        title_opts=opts.TitleOpts(title='三体情绪变化',
                                  pos_top="5%",
                                  pos_right='30%'),
        xaxis_opts=opts.AxisOpts(name="顺序"),
        yaxis_opts=opts.AxisOpts(name="情感"),
        legend_opts=opts.LegendOpts(pos_right=True, orient='vertical'),

    )

line_chart.render_notebook()

<br>

### 3.4 相似度可视化
剧情变化程度，使用cosine相似度度量。

In [55]:
from pyecharts.charts import Line
from pyecharts import options as opts
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
import cntext as ct

sims = []
for idx in range(len(df)):
    try:
        pre_text = df.loc[idx-1, 'doc']
        after_text = df.loc[idx, 'doc']
        sim = ct.cosine_sim(text1=pre_text, text2=after_text)
        sims.append(sim)
    except:
        sims.append(0.4)
    

    


line = Line()
line.add_xaxis(xaxis_data=list(range(20)))
line.add_yaxis("相似度", 
               sims, 
               label_opts=opts.LabelOpts(is_show=False))

line.set_global_opts(title_opts=opts.TitleOpts(title="三体小说剧情相似度变化趋势"))

line.load_javascript()
line.render('三体小说剧情相似度变化趋势.html')
line.render_notebook()