Python Jieba Word Segmentation

Installation

The code is compatible with both Python 2 and Python 3.
Fully automatic install: easy_install jieba, or pip install jieba / pip3 install jieba
Semi-automatic install: download http://pypi.python.org/pypi/jieba/ , extract it, then run python setup.py install
Manual install: place the jieba directory in the current directory or in site-packages
Then reference it with import jieba
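
To verify the install, a minimal smoke test (the sample sentence is jieba's own README example; exact output can vary with the jieba version and dictionary):

# -*- coding:utf-8 -*-
import jieba

# Accurate mode (the default): jieba.cut returns a generator of tokens
print("/".join(jieba.cut(u"我来到北京清华大学")))
# Typically prints: 我/来到/北京/清华大学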

Complete Code

# -*- coding:utf-8 -*-
# __author__ = "Rex"
# Version:1.0.0

import sys
import os
if __name__ == '__main__':
    sys.path.insert(0, os.path.abspath(os.curdir))
import settings
import jieba
import jieba.analyse
import pandas as pd
import xlrd
from collections import Counter
import re

DOC_PATH = os.path.join(settings.DATA_FILE_PATH, "MCD_FENCI.xlsx")
USERDICT = os.path.join(settings.DATA_FILE_PATH, "userdict.txt")
STOP_WORDS = os.path.join(settings.DATA_FILE_PATH, "stopwords.txt")
RESULT_FILE = os.path.join(settings.DATA_FILE_PATH, "FENCI.xlsx")

# Custom dictionary: keeps domain terms such as 麦咖啡 and 香骨鸡腿 from being split apart
jieba.load_userdict(USERDICT)
# Stop words, taken from https://github.com/dongxiexidian/Chinese/blob/master/stopwords.dat
jieba.analyse.set_stop_words(STOP_WORDS)

# Mapping from sheet name to the column(s) that hold free-text comments
SHEET_COLUMN_MAP = {
    'Overall__1': 'Overall Comment',
    'Friendliness__2': 'Friendliness issue - "Other"',
    'Fast__3': 'Fast issue - "Other"',
    'Quality__4': ['Quality item - "Other"', 'Quality issue Other'],
    'Cleanliness__5': 'Cleanliness issue - "Other"',
    'Accuracy__6': 'Accuracy issue - "Other"',
    'EOTF__7': 'EOTF issue - "Other"',
}


class FENCI(object):
    def __init__(self):
        self.book = xlrd.open_workbook(DOC_PATH)
        self.sheets = self.book.sheets()
        self.seg_list = []

    def fenci(self):
        # For each sheet, read the mapped column(s) and segment the text
        for sheet in self.sheets:
            column_name = SHEET_COLUMN_MAP.get(sheet.name)
            if isinstance(column_name, list):
                for col_name in column_name:
                    current_df_series = pd.read_excel(DOC_PATH, sheet.name, encoding='utf-8').dropna()[col_name]
                    self.append_list(current_df_series)
            else:
                current_df_series = pd.read_excel(DOC_PATH, sheet.name, encoding='utf-8').dropna()[column_name]
                self.append_list(current_df_series)

        # Count each remaining token; named "counts" so it does not shadow the re module
        counts = Counter(filter(self.not_empty, self.seg_list))
        result_df = pd.DataFrame([counts]).T
        result_df.columns = ['count']
        result_df.sort_values('count', inplace=True, ascending=False)
        result_df.to_excel(RESULT_FILE, encoding='utf-8')

    def not_empty(self, s):
        # Drop tokens that are nothing but punctuation or whitespace
        # (u"..." literals work on both Python 2 and 3, unlike str.decode("utf8"))
        s = re.sub(u"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", u"", s)
        return s and s.strip()

    def append_list(self, series):
        # jieba.analyse.extract_tags already filters out filler words such as
        # 了 and 的, so it extracts keywords rather than raw segments
        for se in series:
            self.seg_list.extend(list(jieba.analyse.extract_tags(se)))


if __name__ == '__main__':
    obj = FENCI()
    obj.fenci()
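
The script also imports a project-local settings module that the post never shows; all it needs is a DATA_FILE_PATH pointing at the directory holding the input workbook, user dictionary, and stop-word file. A hypothetical minimal sketch (the data/ subdirectory is an assumption):

# settings.py -- hypothetical; the real module is not shown in the post
import os

# Directory holding MCD_FENCI.xlsx, userdict.txt and stopwords.txt
DATA_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")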

Custom Dictionary

For example, 香骨鸡腿: without a custom-dictionary entry, jieba splits it into 香骨 and 鸡腿; once the entry is added, it is kept together as the single token 香骨鸡腿.

jieba.load_userdict(USERDICT)  # custom words such as 麦咖啡 and 香骨鸡腿, to avoid wrong splits
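
Each line of userdict.txt is one entry: the word, then an optional frequency and an optional part-of-speech tag, separated by spaces. A sketch using the post's two example words (the frequencies and tags here are invented):

# userdict.txt -- word [frequency] [POS tag], one entry per line
麦咖啡 10 n
香骨鸡腿 10 n

Individual words can also be registered at runtime with jieba.add_word(u"香骨鸡腿") instead of a dictionary file.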

Stop Words

Words listed in this file are filtered out during keyword extraction, i.e. they never show up in the results.

jieba.analyse.set_stop_words(STOP_WORDS)  # stop words, taken from https://github.com/dongxiexidian/Chinese/blob/master/stopwords.dat
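
A minimal sketch of the effect (the sentence and the file path are invented for illustration):

# -*- coding:utf-8 -*-
import jieba.analyse

text = u"服务很好,就是等了很久"

# Keywords extracted before loading the extra stop-word file
print(jieba.analyse.extract_tags(text))

# After loading a larger stop-word file, any word listed in it
# is dropped from the extracted keywords
jieba.analyse.set_stop_words("stopwords.txt")  # hypothetical path
print(jieba.analyse.extract_tags(text))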