HarvestText

HarvestText : A Toolkit for Text Mining and Preprocessing

HarvestText

2018[]

matplotlib

()

README

:

- [](#)
	- 
- [](#)
    - URL, email, 
- [](#)
	-  
- [](#)
	- 
- [()](./examples/entity_discover/entity_discover.ipynb)
    - [](#)[](./examples/entity_discover/entity_discover.ipynb)
- [](#)
	- 
- [](#)
	- IT
- [](#)
	- 
- [](#)
	- 
- [()](#)
	- 
- [](#)
    - TextTiling/
- [](#)
	- 
- [](#)
    -

- - Textrank
- - Textrank, tfidf

pip

pip install --upgrade harvesttext

setup.py:

python setup.py install

from harvesttext import HarvestText
ht = HarvestText()

# 
pip install pattern
# python <= 3.8
pip install pyhanlp

para = ""
entity_mention_dict = {'':['',''],'':['',''],'':[''],'':[''],'':[''],'':['']}
entity_type_dict = {'':'','':'','':'','':'','':'','':''}
ht.add_entities(entity_mention_dict,entity_type_dict)
print("\nSentence segmentation")
print(ht.seg(para,return_sent=True))    # return_sent=False

print("\nPOS tagging with entity types")
for word, flag in ht.posseg(para):
	print("%s:%s" % (word, flag),end = " ")

: :uj : :c : :uj : :x :r :v :ns :a :uj : :x :r :d :v : : :ul :x :r :v :n :m :x :d :v :n :uj : :d :v :ul :d

for span, entity in ht.entity_linking(para):
	print(span, entity)

[0, 2] ('', '##') [3, 5] ('', '##') [6, 8] ('', '##') [9, 11] ('', '##') [19, 21] ('', '##') [26, 28] ('', '##') [28, 31] ('', '##') [47, 49] ('', '##')

print(ht.cut_sentences(para))

['', '']

""keep_all=Trueel_keep_all()

filter_el_with_rule()

"""A""B"xx/yyxx/yy linking_strategy()

@emailhtml %20

print("")
ht0 = HarvestText()
# 
text1 = "@QXM:[][] //@QXM:[good][good]"
print("@")
print("", text1)
print("", ht0.clean_text(text1))


@
 @QXM:[][] //@QXM:[good][good]

# URL
text1 = "## ....http://t.cn/8FLopdQ"
print("URL")
print("", text1)
print("", ht0.clean_text(text1, remove_url=True))

URL
 ## ....http://t.cn/8FLopdQ
 ## ....

# 
text1 = "[email protected]"
print("")
print("", text1)
print("", ht0.clean_text(text1, email=True))


 [email protected]

# URL
text1 = "www.%E4%B8%AD%E6%96%87%20and%20space.com"
print("URL")
print("", text1)
print("", ht0.clean_text(text1, norm_url=True, remove_url=False))

URL
 www.%E4%B8%AD%E6%96%87%20and%20space.com
 www. and space.com

text1 = "www. and space.com"
print("URL[request]")
print("", text1)
print("", ht0.clean_text(text1, to_url=True, remove_url=False))

URL[request]
 www. and space.com
 www.%E4%B8%AD%E6%96%87%20and%20space.com

# HTML
text1 = "&lt;a c&gt;&nbsp;&#x27;&#x27;"
print("HTML")
print("", text1)
print("", ht0.clean_text(text1, norm_html=True))

HTML
 &lt;a c&gt;&nbsp;&#x27;&#x27;
 <a c> ''

# 
text1 = ""
print("")
print("", text1)
print("", ht0.clean_text(text1, t2s=True))

# markdown
text1 = "[HarvestText : A Toolkit for Text Mining and Preprocessing](https://github.com/blmoistawinde/HarvestText)"
print("markdown")
print("", text1)
print("", ht0.clean_text(text1, t2s=True))

markdown
 [HarvestText : A Toolkit for Text Mining and Preprocessing](https://github.com/blmoistawinde/HarvestText)
 HarvestText : A Toolkit for Text Mining and Preprocessing

pyhanLP

ht0 = HarvestText()
sent = ""
print(ht0.named_entity_recognition(sent))

{'': '', '': '', '': ''}

pyhanLP

ht0 = HarvestText()
para = ""
entity_mention_dict = {'': ['', ''], "":[""]}
entity_type_dict = {'': '', "":""}
ht0.add_entities(entity_mention_dict, entity_type_dict)
for arc in ht0.dependency_parse(para):
    print(arc)
print(ht0.triple_extraction(para))

[0, '', '', '', 3]
[1, '', 'u', '', 0]
[2, '', '', '', 3]
[3, '', '', '', 4]
[4, '', 'v', '', -1]
[5, '', 'ns', '', 8]
[6, '', 'd', '', 8]
[7, '', 'u', '', 6]
[8, '', 'n', '', 4]
[9, '', 'w', '', 4]

print(ht0.triple_extraction(para))

[['', '', '']]

V0.7tolerance

def entity_error_check():
    ht0 = HarvestText()
    typed_words = {"":[""]}
    ht0.add_typed_words(typed_words)
    sent0 = ""
    print(sent0)
    print(ht0.entity_linking(sent0, pinyin_tolerance=0))
    """
    
    [([0, 2], ('', '##')), [(3, 5), ('', '##')]]
    """
    sent1 = ""
    print(sent1)
    print(ht0.entity_linking(sent1, pinyin_tolerance=1))
    """
    
    [([0, 2], ('', '##')), [(3, 5), ('', '##')]]
    """
    sent2 = ""
    print(sent2)
    print(ht0.entity_linking(sent2, char_tolerance=1))
    """
    
    [([0, 2], ('', '##')), [(3, 5), ('', '##')]]
    """
    sent3 = ""
    print(sent3)
    print(ht0.get_linking_mention_candidates(sent3, pinyin_tolerance=1, char_tolerance=1))
    """
    
    ('', defaultdict(<class 'list'>, {(0, 2): {''}, (3, 5): {''}}))
    """

print("\nsentiment dictionary")
sents = ["",
      "",
      "",
      ""]
sent_dict = ht.build_sent_dict(sents,min_times=1,pos_seeds=[""],neg_seeds=[""])
print("%s:%f" % ("",sent_dict[""]))
print("%s:%f" % ("",sent_dict[""]))
print("%s:%f" % ("",sent_dict[""]))

sentiment dictionary :1.000000 :0.000000 :-1.000000

print("\nsentence sentiment")
sent = ""
print("%f:%s" % (ht.analyse_sent(sent),sent))

0.600000:

SO-PMI[0,1][-1,1]scale

print("\nsentiment dictionary using default seed words")
docs = ["",
        "",
    "",
    ""
   ]
# scale: [-1,1]
# pos_seeds, neg_seeds, get_qh_sent_dict()
print("scale=\"0-1\", 100.5")
sent_dict = ht.build_sent_dict(docs,min_times=1,scale="0-1")
print("%s:%f" % ("",sent_dict[""]))
print("%s:%f" % ("",sent_dict[""]))
print("%s:%f" % ("",sent_dict[""]))
print("%f:%s" % (ht.analyse_sent(docs[0]), docs[0]))
print("%f:%s" % (ht.analyse_sent(docs[1]), docs[1]))

sentiment dictionary using default seed words
scale="0-1", 100.5
:1.000000
:0.153846
:0.000000
0.449412:
0.364910:

print("scale=\"+-1\", 0")
sent_dict = ht.build_sent_dict(docs,min_times=1,scale="+-1")
print("%s:%f" % ("",sent_dict[""]))
print("%s:%f" % ("",sent_dict[""]))
print("%s:%f" % ("",sent_dict[""]))
print("%f:%s" % (ht.analyse_sent(docs[0]), docs[0]))
print("%f:%s" % (ht.analyse_sent(docs[1]), docs[1]))

scale="+-1", 0
:1.000000
:0.000000
:-1.000000
0.349305:
-0.159652:

add_entities

docs = ["",
		"",
		"",
		""]
inv_index = ht.build_index(docs)
print(ht.get_entity_counts(docs, inv_index))  # 
# {'': 3, '': 2, '': 2}

print(ht.search_entity("", docs, inv_index))  # 
# ['', '', '']

print(ht.search_entity(" ", docs, inv_index))  # 
# ['']

# 
subdocs = ht.search_entity("## ", docs, inv_index)
print(subdocs)  # 
# ['', '']
inv_index2 = ht.build_index(subdocs)
print(ht.get_entity_counts(subdocs, inv_index2, used_type=[""]))  # 
# {'': 2, '': 1}

(networkx) (networkx.Graph)

# 
ht.add_new_entity("", "", "")
docs = ["",
		""]
G = ht.build_entity_graph(docs)
print(dict(G.edges.items()))
G = ht.build_entity_graph(docs, used_types=[""])
print(dict(G.edges.items()))

build_word_ego_graph()

entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
ht0.add_entities(entity_mention_dict, entity_type_dict)
sanguo1 = get_sanguo()[0]
stopwords = get_baidu_stopwords()
docs = ht0.cut_sentences(sanguo1)
G = ht0.build_word_ego_graph(docs,"",min_freq=3,other_min_freq=2,stopwords=stopwords)

(networkx) Textrank(maxlen)

print("\nText summarization")
docs = ["",
        "",
        "",
        ""]
for doc in ht.get_summary(docs, topK=2):
    print(doc)
print("\nText summarization()")
for doc in ht.get_summary(docs, topK=3, avoid_repeat=True):
    print(doc)

Text summarization



Text summarization()

textrankHarvestTextjiebajieba_tfidf

(example)

# text
print("")
kwds = ht.extract_keywords(text, 5, method="jieba_tfidf")
print("jieba_tfidf", kwds)
kwds = ht.extract_keywords(text, 5, method="textrank")
print("textrank", kwds)


jieba_tfidf ['', '', '', '', '']
textrank ['', '', '', '', '']

CSL.ipynb textrank4zh CSL

	P@5	R@5	F@5
textrank4zh	0.0836	0.1174	0.0977
ht_textrank	0.0955	0.1342	0.1116
ht_jieba_tfidf	0.1035	0.1453	0.1209

demo

get_qh_sent_dict: http://nlp.csai.tsinghua.edu.cn/site2/index.php/13-sms
get_baidu_stopwords: https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html
get_qh_typed_words: THUNLP http://thuocl.thunlp.org/ ['IT', '', '', '', '', '', '', '', '']
get_english_senti_lexicon:
get_jieba_dict: jieba

def load_resources():
	from harvesttext.resources import get_qh_sent_dict,get_baidu_stopwords,get_sanguo,get_sanguo_entity_dict
    sdict = get_qh_sent_dict()              # {"pos":[...],"neg":[...]}
    print("pos_words:",list(sdict["pos"])[10:15])
    print("neg_words:",list(sdict["neg"])[5:10])
    
    stopwords = get_baidu_stopwords()
    print("stopwords:", list(stopwords)[5:10])

    docs = get_sanguo()                 # 
    print("16:\n",docs[-1][-16:])
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    print(" ",entity_mention_dict[""])
    print(" ",entity_type_dict[""])
    print(" ", entity_type_dict[""])
    print(" ", entity_type_dict[""])
load_resources()

pos_words: ['', '', '', '', '']
neg_words: ['', '', '', '', '']
stopwords: ['apart', '', '', 'probably', 'think']
16:
 
  ['', '', '']

def using_typed_words():
    from harvesttext.resources import get_qh_typed_words,get_baidu_stopwords
    ht0 = HarvestText()
    typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
    ht0.add_typed_words(typed_words)
    sentence = "THUOCL"
    print(sentence)
    print(ht0.posseg(sentence,stopwords=stopwords))
using_typed_words()

THUOCL
[('THUOCL', 'eng'), ('', 'IT'), ('', 'm'), ('', 'nz'), ('', 'n'), ('', 'n'), ('', 'v'), ('', 'b'), ('', 'n'), ('', 'n'), ('', ''), ('', 'v'), ('', 'n'), ('', 'IT'), ('', 'n')]

IT,

para = ""
#(pd.DataFrame)
new_words_info = ht.word_discover(para)
#new_words_info = ht.word_discover(para, threshold_seeds=[""])  
new_words = new_words_info.index.tolist()
print(new_words)

[""]

auto_param=False

:param max_word_len: 
:param min_freq: 
:param min_entropy: 
:param min_aggregation:

min_entropy = np.log(length) / 10
min_freq = min(0.00005, 20.0 / length)
min_aggregation = np.sqrt(length) / 15

http://www.matrix67.com/blog/archives/5044

sort_by='score'

def new_word_register():
    new_words = ["","666"]
    ht.add_new_words(new_words)   # ""
    ht.add_new_entity("", mention0="", type0="")  # 
    print(ht.seg("666", return_sent=True))
    for word, flag in ht.posseg("666"):
        print("%s:%s" % (word, flag), end=" ")

:r : :v :ud :d 666:

# find_with_rules()
from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
text0 = "Pythonrequests"
ht0 = HarvestText()

found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish()], type0="")
print(found_entities)
print(ht0.posseg(text0))

{'Python', 'requests'}
[('', 'r'), ('', 'v'), ('Python', ''), ('', 'x'), ('', 'c'), ('requests', ''), ('', 'n'), ('', 'd'), ('', 'v'), ('', 'n')]

TextTiling/

ht0 = HarvestText()
text = """13919
201932419
80

185"""
print("[5]")
print(text+"\n")
print("[3]")
predicted_paras = ht0.cut_paragraphs(text, num_paras=3)
print("\n".join(predicted_paras)+"\n")

seq_chars``align_boundary=False``examples/basic.py``cut_paragraph()

print("")
text2 = extract_only_chinese(text)
predicted_paras2 = ht0.cut_paragraphs(text2, num_paras=5, seq_chars=10, align_boundary=False)
print("\n".join(predicted_paras2)+"\n")

from harvesttext import loadHT,saveHT
para = ""
saveHT(ht,"ht_model1")
ht2 = loadHT("ht_model1")

# 
ht2.clear()
print("cut with cleared model")
print(ht2.seg(para))

naiveKGQA.py

QA = NaiveKGQA(SVOs, entity_type_dict=entity_type_dict)
questions = ["","","","",
			 "",""]
for question0 in questions:
	print(""+question0)
	print(""+QA.answer(question0))

HarvestText

#  "Until the Day" by JJ Lin
test_text = """
In the middle of the night. 
Lonely souls travel in time.
Familiar hearts start to entwine.
We imagine what we'll find, in another life.  
""".lower()
ht_eng = HarvestText(language="en")
sentences = ht_eng.cut_sentences(test_text)  # 
print("\n".join(sentences))
print(ht_eng.seg(sentences[-1]))             # []
print(ht_eng.posseg(sentences[0], stopwords={"in"}))
# 
sent_dict = ht_eng.build_sent_dict(sentences, pos_seeds=["familiar"], neg_seeds=["lonely"],
                                   min_times=1, stopwords={'in', 'to'})
print("sentiment analysis")
for sent0 in sentences:
    print(sent0, "%.3f" % ht_eng.analyse_sent(sent0))
# 
print("Segmentation")
print("\n".join(ht_eng.cut_paragraphs(test_text, num_paras=2)))

# 
# from harvesttext.resources import get_english_senti_lexicon
# sent_lexicon = get_english_senti_lexicon()
# sent_dict = ht_eng.build_sent_dict(sentences, pos_seeds=sent_lexicon["pos"], neg_seeds=sent_lexicon["neg"], min_times=1)

in the middle of the night.
lonely souls travel in time.
familiar hearts start to entwine.
we imagine what we'll find, in another life.

['we', 'imagine', 'what', 'we', "'ll", 'find', ',', 'in', 'another', 'life', '.']
[('the', 'DET'), ('middle', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('night', 'NOUN'), ('.', '.')]

sentiment analysis
in the middle of the night. 0.000
lonely souls travel in time. -1.600
familiar hearts start to entwine. 1.600
we imagine what we'll find, in another life. 0.000

Segmentation
in the middle of the night. lonely souls travel in time. familiar hearts start to entwine.
we imagine what we'll find, in another life.

@misc{zhangHarvestText,
	author = {Zhiling Zhang},
	title = {HarvestText: A Toolkit for Text Mining and Preprocessing},
	journal = {GitHub repository},
	howpublished = {\url{https://github.com/blmoistawinde/HarvestText}},
	year = {2023}
}

issuesStar~

repo

snownlp

pyhanLP

funNLP

ChineseWordSegmentation

EventTriplesExtraction

textrank4ZH

Package Rankings

Top 4.85% on Pypi.org