What is a good approach for extracting a particular class of words from text in Python?
Answered on 2019-09-11 08:43:50
Implementing TextRank keyword extraction in Python
I started from http://blog.csdn.net/sa14023053/article/details/51713301,
but I ran into quite a few pitfalls while debugging, so I am recording them here for reference. A minimal sketch of the overall idea follows before the details.
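For a quick sense of the approach, the core TextRank idea as I understand it is: tokenize, POS-tag, keep only nouns, connect words that co-occur in the same sentence, then rank the graph nodes with PageRank. The snippet below is my own simplified illustration, not the blog author's code; it assumes nltk (with the punkt and averaged_perceptron_tagger data downloaded) and networkx are installed:

import itertools
import nltk
import networkx as nx

def textrank_keywords(text, top_n=10):
    # Simplified TextRank: rank nouns by PageRank over a sentence co-occurrence graph.
    graph = nx.Graph()
    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        # keep only noun-like tokens, lowercased
        nouns = [w.lower() for w, t in tagged if t.startswith('NN') and w.isalpha()]
        # connect every pair of nouns that co-occur in this sentence
        for w1, w2 in itertools.combinations(set(nouns), 2):
            if graph.has_edge(w1, w2):
                graph[w1][w2]['weight'] += 1
            else:
                graph.add_edge(w1, w2, weight=1)
    if graph.number_of_nodes() == 0:
        return []
    scores = nx.pagerank(graph)
    return sorted(scores, key=scores.get, reverse=True)[:top_n]

print(textrank_keywords('Graph based ranking algorithms are useful for keyword extraction. '
                        'TextRank builds a graph of words and ranks them by importance.'))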
My environment:
Machine: Windows 10, 32-bit, i7 processor
Python: Anaconda3 (Python 3.6)
VSCode: 1.13.1
Problems I ran into:
1. Environment configuration: because of the VSCode version, the Python path in launch.json has to be "pythonPath": "${config:python.pythonPath}" (older VSCode releases used a slightly different form of this setting).
2. The blog does not say which Python version it uses; running under Python 3.6 I hit an encoding problem, an iteritems() problem, and issues with str and map.
The specific fixes are reflected in the script below; an example of the kind of changes involved follows.
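For reference, the Python 3 adjustments were roughly of the following kind (hypothetical minimal snippets of my own, not the blog's exact diff):

# 1. Encoding: open the article files with an explicit encoding instead of
#    decoding each line manually, e.g. open(path, 'r', encoding='UTF-8').

# 2. dict.iteritems() no longer exists in Python 3; use items():
scores = {'word': 0.5, 'graph': 0.3}
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)   # was: scores.iteritems()

# 3. map() now returns an iterator, so wrap it in list() before indexing:
first = list(map(str.upper, ['graph', 'word']))[0]                    # was: map(...)[0]
print(ranked, first)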
One remaining drawback is that the graph the basic version builds contains only single words, with no chunks (multi-word phrases), so it needs some adjustment; the idea is sketched next, and the full script below implements it.
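The adjustment boils down to two steps: mine frequent bigram collocations from the corpus with NLTK, then merge adjacent tokens that form such a collocation into a single multi-word node. Below is a minimal sketch of that merging step, written by me over plain word lists rather than (word, tag) pairs, assuming only nltk:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

def find_collocations(words, min_freq=3, top_n=20):
    # Top PMI-ranked bigrams that occur at least min_freq times.
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(min_freq)
    return set(finder.nbest(BigramAssocMeasures().pmi, top_n))

def merge_collocations(words, collocations):
    # Join adjacent words that form a known collocation into one chunk token.
    merged, i = [], 0
    while i < len(words):
        if i + 1 < len(words) and (words[i], words[i + 1]) in collocations:
            merged.append(words[i] + ' ' + words[i + 1])
            i += 2
        else:
            merged.append(words[i])
            i += 1
    return merged

The full script that follows does the same thing over (word, tag) pairs, via joincolloc, joincollocbi, groupne2 and groupne3.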
import sys
import langid
import os
import nltk
from nltk.tag.api import TaggerI
from nltk.internals import find_file, find_jar, config_java, java, _java_options, find_jars_within_path
import itertools
from operator import itemgetter
from nltk.stem import WordNetLemmatizer
import networkx as nx
from nltk.collocations import *
from nltk.stem.porter import *

tagger = nltk.tag.perceptron.PerceptronTagger()
wnl = WordNetLemmatizer()
colloc_list = []      # top PMI bigram collocations, filled in below
entity_names = []     # named entities found by ne_chunk_sents (optional)


def filter_for_tags(tagged, tags=['NN', 'NNPS', 'NNP', 'NNS']):
    """Keep only noun tokens."""
    return [item for item in tagged if item[1] in tags]


def filter_numbers(tagged):
    """Drop very short tokens and pure numbers."""
    return [item for item in tagged if len(item[0]) > 2 and not item[0].isdigit()]


def normalize(tagged):
    """Strip periods from tokens."""
    return [(item[0].replace('.', ''), item[1]) for item in tagged]


def normalize_tags(tagged):
    """Collapse tags to their first character."""
    return [(item[0], item[1][0:1]) for item in tagged]


def lowercase(tagged):
    return [(w.lower(), t) for (w, t) in tagged]


def rstopwords(tagged):
    """Remove English stopwords."""
    return [(w, t) for (w, t) in tagged if w not in nltk.corpus.stopwords.words('english')]


def lemmatize(tagged):
    """Lemmatize single-word tokens; leave multi-word chunks untouched."""
    return [(wnl.lemmatize(item[0]), item[1]) if ' ' not in item[0] else (item[0], item[1])
            for item in tagged]


def extract_entity_names(t):
    """Recursively collect named-entity strings from an NE chunk tree."""
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names


def joincolloc(tagged):
    """Merge adjacent (word, tag) pairs that form a known collocation into one chunk."""
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 1):
        if sw == 1:
            sw = 0
            continue
        if (tagged[i], tagged[i + 1]) in colloc_list:
            sw = 1
            if tagged[i][1].startswith('NN') or tagged[i + 1][1].startswith('NN'):
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'NN'))
            elif tagged[i][1] == 'RB' or tagged[i + 1][1] == 'RB':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'RB'))
            else:
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], tagged[i][1]))
        else:
            tagged1.append(tagged[i])
    if len(tagged) > 0:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1


def groupne2(tagged):
    """Merge two adjacent words that together form a known named entity."""
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 1):
        if sw == 1:
            sw = 0
            continue
        if (tagged[i][0] + ' ' + tagged[i + 1][0]) in entity_names:
            sw = 1
            if tagged[i][1] == 'NNP' or tagged[i + 1][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'NNP'))
            elif tagged[i][1] == 'NN' or tagged[i + 1][1] == 'NN':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'NN'))
            elif tagged[i][1] == 'RB' or tagged[i + 1][1] == 'RB':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'RB'))
            else:
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], tagged[i][1]))
        else:
            tagged1.append(tagged[i])
    if len(tagged) > 0:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1


def groupne3(tagged):
    """Merge three adjacent words that together form a known named entity."""
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 2):
        if sw == 1:
            sw = 0
            continue
        if (tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0]) in entity_names:
            sw = 1
            if tagged[i][1] == 'NNP' or tagged[i + 1][1] == 'NNP' or tagged[i + 2][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], 'NNP'))
            elif tagged[i][1] == 'NN' or tagged[i + 1][1] == 'NN' or tagged[i + 2][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], 'NN'))
            elif tagged[i][1] == 'RB' or tagged[i + 1][1] == 'RB' or tagged[i + 2][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], 'RB'))
            else:
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], tagged[i][1]))
        else:
            tagged1.append(tagged[i])
    if len(tagged) > 1:
        tagged1.append(tagged[len(tagged) - 2])
        tagged1.append(tagged[len(tagged) - 1])
    elif len(tagged) == 1:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1


def joincollocbi(tagged):
    """Merge collocations that span already-merged chunks (compare boundary words only)."""
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 1):
        if sw == 1:
            sw = 0
            continue
        if ' ' in tagged[i][0]:
            t1 = (tagged[i][0][tagged[i][0].find(' '):].strip(), tagged[i][1])
        else:
            t1 = (tagged[i][0], tagged[i][1])
        if ' ' in tagged[i + 1][0]:
            t2 = (tagged[i + 1][0][:tagged[i + 1][0].find(' ')].strip(), tagged[i][1])
        else:
            t2 = (tagged[i + 1][0], tagged[i + 1][1])
        if (t1, t2) in colloc_list:
            sw = 1
            if tagged[i][1] == 'NNP' or tagged[i + 1][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'NNP'))
            elif tagged[i][1] == 'NN' or tagged[i + 1][1] == 'NN':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'NN'))
            elif tagged[i][1] == 'RB' or tagged[i + 1][1] == 'RB':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'RB'))
            else:
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], tagged[i][1]))
        else:
            tagged1.append(tagged[i])
    if len(tagged) > 0:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1


blacklist = []
fname = sys.argv[1]                      # folder containing the article files
articles = os.listdir(fname)
FOLDER = 'keywords-' + fname + '-textrank'
if not os.path.exists(FOLDER):
    os.makedirs(FOLDER)

# First pass: tag the whole corpus so bigram collocations can be mined.
tagged = []
for article in articles:
    # articleFile = open(fname + '/' + article, 'r')
    articleFile = open(fname + '/' + article, 'r', encoding='UTF-8')
    for linee in articleFile:
        # line = linee.decode('latin-1')
        line = linee
        lang = langid.classify(line.strip())
        if not lang[0] == 'en':          # keep only English lines
            continue
        sentences = nltk.sent_tokenize(line.strip())
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [tagger.tag(sentence) for sentence in tokenized_sentences]
        for sentence in tagged_sentences:
            tagged.extend(sentence)
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
        # for tree in chunked_sentences:
        #     entity_names.extend(extract_entity_names(tree))
    articleFile.close()
# entity_names = set(entity_names)
# print(entity_names)

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(tagged)
finder.apply_freq_filter(3)
colloc_list = finder.nbest(bigram_measures.pmi, 20)  # this needs to be tweaked

# Second pass: build a co-occurrence graph per article and rank nodes with PageRank.
for article in articles:
    print('Reading articles/' + article)
    # articleFile = open(fname + '/' + article, 'r')
    articleFile = open(fname + '/' + article, 'r', encoding='UTF-8')
    tagged = []
    sentences = []
    k = 0
    for linee in articleFile:
        # line = linee.decode('latin-1')
        line = linee
        lang = langid.classify(line.strip())
        if not lang[0] == 'en':
            continue
        sents = nltk.sent_tokenize(line.strip())
        tok_sents = [nltk.word_tokenize(sent) for sent in sents]
        tagged_sents = [tagger.tag(sent) for sent in tok_sents]
        tagged_sents = [joincolloc(sent) for sent in tagged_sents]
        tagged_sents = [joincollocbi(sent) for sent in tagged_sents]
        tagged_sents = [groupne2(sent) for sent in tagged_sents]
        tagged_sents = [groupne3(sent) for sent in tagged_sents]
        tagged_sents = [filter_for_tags(sent) for sent in tagged_sents]
        tagged_sents = [normalize_tags(sent) for sent in tagged_sents]
        tagged_sents = [normalize(sent) for sent in tagged_sents]
        tagged_sents = [filter_numbers(sent) for sent in tagged_sents]
        tagged_sents = [lowercase(sent) for sent in tagged_sents]
        tagged_sents = [lemmatize(sent) for sent in tagged_sents]
        tagged_sents = [rstopwords(sent) for sent in tagged_sents]
        for sent in tagged_sents:
            tagged.extend(sent)
        sentences.extend(tagged_sents)
    gr = nx.MultiGraph()
    for sentence in sentences:
        if len(sentence) > 1:
            for i in range(len(sentence) - 1):
                for j in range(i + 1, len(sentence)):
                    try:
                        s1 = sentence[i][0] + '/' + sentence[i][1]
                        s2 = sentence[j][0] + '/' + sentence[j][1]
                        # wt = float(1.0) / float(len(sentence))  # if weighting by sentence length is desired
                        wt = 1
                        gr.add_edge(s1, s2, weight=wt)
                    except Exception:  # the original code caught pygraph's AdditionError here
                        pass
    # Collapse the multigraph into a simple weighted graph.
    H = nx.Graph()
    for u, v, d in gr.edges(data=True):
        w = d['weight']
        if H.has_edge(u, v):
            H[u][v]['weight'] += w
        else:
            H.add_edge(u, v, weight=w)
    calculated_page_rank = nx.pagerank(H)
    keyphraseFile = open(FOLDER + '/' + article, 'w')
    # di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1), reverse=True)
    di = sorted(calculated_page_rank.items(), key=itemgetter(1), reverse=True)
    # Earlier variant, kept for reference:
    # for k, g in itertools.groupby(di, key=itemgetter(1)):
    #     try:
    #         w = str(map(itemgetter(0), g)[0])
    #         w = w[:w.find('/')]
    #         if len(w) > 2 and w not in blacklist:
    #             if w not in dic:
    #                 keyphraseFile.write(w.replace(' ', '_') + ':' + str(k)[0:6] + ' ')
    #                 dic.append(w)
    #     except:
    #         pass
    dic = []
    for k, g in itertools.groupby(di, key=itemgetter(1)):
        try:
            print(k)
            for item in g:
                print(item)
                w = str(item[0])
                w = w[0:w.index('/')]
                if len(w) > 2 and w not in blacklist:
                    # if len(w) > 2:
                    if w not in dic:
                        keyphraseFile.write(w.replace(' ', '_') + ':' + str(k)[0:6] + ' ')
                        dic.append(w)
        except:
            pass
    keyphraseFile.close()
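As far as I can tell from the script, it takes the name of a folder of plain-text articles as its only command-line argument (fname = sys.argv[1]), keeps only the English lines of each file (checked with langid), and writes one keyword file per article into a new folder named keywords-<folder>-textrank, each line holding word:score pairs separated by spaces. Saved as, say, textrank_keywords.py (the file name is up to you), it would be run as python textrank_keywords.py articles, where articles is the input folder.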