NLP之spacy

PyDictionary可以找到词的同义词和翻译

from PyDictionary import PyDictionary
dictionary=PyDictionary()
dictionary.meaning("indentation")

dictionary=PyDictionary("hotel","ambush","nonchalant","perceptive")
dictionary.getMeanings() '''This will return meanings as dictionaries'''
dictionary.getSynonyms()

vocabulary 提供翻译、同义词、meaning等功能。

spacy 模型的地址：https://github.com/explosion/spacy-models，每个模型详细介绍https://spacy.io/usage/models

下载模型：先执行 pip install -U spacy，再执行 python -m spacy download en_core_web_sm。

基本操作

import spacy
nlp = spacy.load("en_core_web_sm")
#级联顺序是
sents_list = []#将一段拆分为句子
for snt in nlp(paper['content']).sents:
    sents_list.append(snt)
#将句子拆分为词
print([token.text for token in sents_list[0]])

import re
lambda x:re.sub(r'<script.*?/script>','',x.replace('\r\n','').replace('\n',''))#去掉js
lambda x:re.sub(r'<style.*?/style>','',x)#去掉css
import html2text
lambda x:html2text.html2text(x)#将html格式转化为文本格式

pos的缩写代表的意思

POS Tag	Description	Example
0	CC	coordinating conjunction	and
1	CD	cardinal number	1, third
2	DT	determiner	the
3	EX	existential there	there is
4	FW	foreign word	d'oeuvre
5	IN	preposition or subordinating conjunction	in, of, like
6	JJ	adjective	big
7	JJR	adjective, comparative	bigger
8	JJS	adjective, superlative	biggest
9	LS	list marker	1)
10	MD	modal	could, will
11	NN	noun, singular or mass	door
12	NNS	noun plural	doors
13	NNP	proper noun, singular	John
14	NNPS	proper noun, plural	Vikings
15	PDT	predeterminer	both the boys
16	POS	possessive ending	friend's
17	PRP	personal pronoun	I, he, it
18	PRP$	possessive pronoun	my, his
19	RB	adverb	however, usually, naturally, here, good
20	RBR	adverb, comparative	better
21	RBS	adverb, superlative	best
22	RP	particle	give up
23	TO	to	to go, to him
24	UH	interjection	uhhuhhuhh
25	VB	verb, base form	take
26	VBD	verb, past tense	took
27	VBG	verb, gerund or present participle	taking
28	VBN	verb, past participle	taken
29	VBP	verb, sing. present, non-3d	take
30	VBZ	verb, 3rd person sing. present	takes
31	WDT	wh-determiner	which
32	WP	wh-pronoun	who, what
33	WP$	possessive wh-pronoun	whose
34	WRB	wh-adverb	where, when

对应的表格https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md

形态morphology：就是词语的词根不会变化，而多了前缀和后缀修饰。比方说一个动词有过去式、现在进行时和过去完成时，这些形式都被认为拥有同一个词根。

每个句子(nlp对象)包含单词，单词又是morphology对象

from spacy.morphology import Morphology
morphology = Morphology(strings)

feats = "Feat1=Val1|Feat2=Val2"
hash = nlp.vocab.morphology.add(feats)
assert nlp.vocab.morphology.get(hash) == feats

d = Morphology.feats_to_dict("Feat1=Val1|Feat2=Val2")
assert d == {"Feat1": "Val1", "Feat2": "Val2"}

f = Morphology.dict_to_feats({"Feat1": "Val1", "Feat2": "Val2"})
assert f == "Feat1=Val1|Feat2=Val2"

kindred

这是一个专门处理医学文献的NLP库。

kindred提供根据pmid下载文章的函数kindred.pubtator.load(pmids),但是这个很不好用，会报错RuntimeError: Unable to download PubTator data after 3 retries,就是pubmed

textacy

数据预处理模块

>>> from textacy import preprocessing
>>> preprocessing.normalize.whitespace(preprocessing.remove.punctuation(text))[:80]
'Since the so called statistical revolution in the late 1980s and mid 1990s much '
#实例化spacy doc
>>> doc = textacy.make_spacy_doc(text,'en_core_web_sm')
>>> doc._.preview
'Doc(85 tokens: "Since the so-called "statistical revolution" in...")'
doc._.meta

#基础操作
corpus = textacy.Corpus("en_core_web_sm", data=records)#实例化corpus
corpus.add(texts);corpus.remove(lambda doc: doc._.meta.get("speaker_name") == "Rick Santorum")#添加和删除
>>> match_func = lambda doc: doc._.meta.get("speaker_name") == "Bernie Sanders"#根据函数切片
>>> for doc in corpus.get(match_func, limit=3):
...     print(doc._.preview)
Doc(159 tokens: "Mr. Speaker, 480,000 Federal employees are work...")
Doc(336 tokens: "Mr. Speaker, I thank the gentleman for yielding...")
Doc(177 tokens: "Mr. Speaker, if we want to understand why in th...")
#进行简单的统计分析
>>> corpus.n_docs, corpus.n_sents, corpus.n_tokens
(56, 1771, 41573)
>>> word_counts = corpus.word_counts(by="lemma_")
>>> sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:5]
[('-PRON-', 2553), ('people', 215), ('year', 148), ('Mr.', 139), ('$', 137)]
>>> word_doc_counts = corpus.word_doc_counts(by="lemma_", weighting="freq")


list(textacy.extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, filter_nums=False))
#这里的3是n-gram的n，即每个片段包含的词数

#找句子中间的关键词的算法	
from textacy.extract import keyterms as kt
kt.textrank(doc, normalize="lemma", topn=10)
kt.sgrank(doc, ngrams=(1, 2, 3, 4), normalize="lower", topn=0.1)#感觉sgrank会更有效率一些
assert hh=='Prostaglandin E2 (PGE2) interacts with tyrosine kinases receptor signaling in both tumor and stromal cells supporting tumor progression.'
textacy.extract.keyterms.sgrank(hh)
>>>[('tyrosine kinase receptor', 0.4217308427210148),
 ('prostaglandin E2', 0.16628512295344672),
 ('stromal cell', 0.11715540305665428),
 ('tumor progression', 0.11348381294556975),
 ('PGE2', 0.08474522854970501)]
textacy.extract.keyterms.textrank(hh)
>>>[('tyrosine kinase receptor', 0.14942925583327316),
 ('tumor progression', 0.13997039482865523),
 ('stromal cell', 0.09483809672182696),
 ('prostaglandin E2', 0.0900362546238346),
 ('PGE2', 0.05244106830477502)]
textacy.extract.keyterms.yake(hh)
>>>[('prostaglandin E2', 0.5032452124985934),
 ('PGE2', 0.5566998721996959),
 ('tumor', 0.578886787744827),
 ('progression', 0.7021147266509096),
 ('tyrosine', 0.8578316139578024),
 ('kinase', 0.8578316139578024),
 ('receptor', 0.8578316139578024),
 ('stromal', 0.8578316139578024),
 ('cell', 0.8578316139578024),
 ('tumor progression', 1.8105676281183678)]
textacy.extract.keyterms.scake(hh)
>>>[('prostaglandin E2', 120.0),
 ('tyrosine kinase receptor', 26.88888888888889),
 ('tumor progression', 21.864661654135336),
 ('PGE2', 20.0),
 ('stromal cell', 9.705882352941178)]

#读取句子每行一个
texts = textacy.io.read_text('~/Desktop/burton-tweets.txt', lines=True)

## 分析Corpus 这里理解为doc是一个句子，corpus是由多个doc组成的集合
#实例化corpus
>>> corpus = textacy.Corpus("en", data=records)
>>> corpus
Corpus(1240 docs, 857548 tokens)
>>> corpus.n_docs, corpus.n_sents, corpus.n_tokens
(1240, 34530, 857548)
#import textacy.vsm 这个导入不了 在新版本中已经换了
textacy.representations.vectorizers.Vectorizer

主要的部分有: information extraction,text statistics,document similarity,data augmentation.

对extract部分比较感兴趣，说一下这里的API

textacy.extract.triples.subject_verb_object_triples(doc)#可以找到句子中的主谓宾语
[i for i in  subject_verb_object_triples(doc)]
[SVOTriple(subject=[It], verb=[was, founded], object=[Romans]),
 SVOTriple(subject=[who], verb=[named], object=[it])]

#提取缩略词，在我看来不太准
hh=nlp('Prostaglandin E2 (PGE2) interacts with tyrosine kinases receptor signaling in both tumor and stromal cells supporting tumor progression.')
list(textacy.extract.acros.acronyms(hh))#para known_acro_defs如果有不会去找新的字典
[E2, PGE2]
textacy.extract.acros.acronyms_and_definitions(hh)
{'E2': '', 'PGE2': ''}
[textacy.extract.acros.is_acronym(i) for i in ['like','E2']]
[False, True]

主题模型是提取文本、计算抽象主题相似性的一种统计学模型。用了sklearn的 LSA、LDA 和 NMF 三个模型，并将它们用到矢量化文本上。

wenyuhao 发布于 2021-09-04

kindred

textacy

wenyuhao