# -*- coding: utf-8 -*-
# __author__ = "Rex"
# Version: 1.0.0
import sys
import os

if __name__ == '__main__':
    # When run directly, make the project root importable so `settings` resolves.
    sys.path.insert(0, os.path.abspath(os.curdir))

import settings

import re
from collections import Counter

import jieba
import jieba.analyse
import pandas as pd
import xlrd
DOC_PATH = os.path.join(settings.DATA_FILE_PATH, "MCD_FENCI.xlsx")
USERDICT = os.path.join(settings.DATA_FILE_PATH, "userdict.txt")
STOP_WORDS = os.path.join(settings.DATA_FILE_PATH, "stopwords.txt")
RESULT_FILE = os.path.join(settings.DATA_FILE_PATH, "FENCI.xlsx")
# Custom dictionary so domain terms (e.g. 麦咖啡, 香骨鸡腿) are not split apart.
jieba.load_userdict(USERDICT)
# Stop word list, taken from
# https://github.com/dongxiexidian/Chinese/blob/master/stopwords.dat
jieba.analyse.set_stop_words(STOP_WORDS)
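# userdict.txt follows jieba's plain-text dictionary format, one entry per
# line: "word [frequency] [POS tag]", with frequency and tag optional.
# Illustrative entries (the actual file contents are an assumption):
#
#     麦咖啡 3 n
#     香骨鸡腿 3 n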
# Mapping from sheet name to the column(s) whose text should be segmented.
SHEET_COLUMN_MAP = {
    'Overall__1': 'Overall Comment',
    'Friendliness__2': 'Friendliness issue - "Other"',
    'Fast__3': 'Fast issue - "Other"',
    'Quality__4': ['Quality item - "Other"', 'Quality issue Other'],
    'Cleanliness__5': 'Cleanliness issue - "Other"',
    'Accuracy__6': 'Accuracy issue - "Other"',
    'EOTF__7': 'EOTF issue - "Other"',
}
class FENCI(object):

    def __init__(self):
        # Note: xlrd 2.x dropped .xlsx support, so this requires xlrd < 2.0.
        self.book = xlrd.open_workbook(DOC_PATH)
        self.sheets = self.book.sheets()
        self.seg_list = []

    def fenci(self):
        # For each sheet, read the mapped column(s) and segment their text.
        for sheet in self.sheets:
            column_name = SHEET_COLUMN_MAP.get(sheet.name)
            if column_name is None:
                continue
            if not isinstance(column_name, list):
                column_name = [column_name]
            for col_name in column_name:
                current_df_series = pd.read_excel(DOC_PATH, sheet_name=sheet.name).dropna()[col_name]
                self.append_list(current_df_series)
        # Drop punctuation-only tokens, then count occurrences of each keyword.
        counts = Counter(filter(self.not_empty, self.seg_list))
        result_df = pd.DataFrame([counts]).T
        result_df.columns = ['count']
        result_df.sort_values('count', inplace=True, ascending=False)
        result_df.to_excel(RESULT_FILE)

    def not_empty(self, s):
        # Strip punctuation (ASCII and full-width) and whitespace; a token that
        # becomes empty here is filtered out in fenci().
        s = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", s)
        return s and s.strip()

    def append_list(self, series):
        # jieba.analyse.extract_tags returns ranked keywords only, which already
        # drops most function words such as "了" and "的".
        for se in series:
            self.seg_list.extend(jieba.analyse.extract_tags(se))


if __name__ == '__main__':
    obj = FENCI()
    obj.fenci()
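# Minimal usage sketch (assumes settings.DATA_FILE_PATH points at a directory
# containing MCD_FENCI.xlsx, userdict.txt and stopwords.txt):
#
#     obj = FENCI()
#     obj.fenci()   # writes keyword counts to FENCI.xlsx
#
# The output workbook has one row per keyword and a single 'count' column,
# sorted from most to least frequent.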