NeatText: a simple NLP package for cleaning textual data and text preprocessing
MIT License
NeatText: a simple NLP package for cleaning textual data and text preprocessing. Simplifying Text Cleaning For NLP & ML
pip install neattext
TextFrame
object. This allows us to do more with our text.
>>> import neattext as nt
>>> mytext = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊."
>>> docx = nt.TextFrame(text=mytext)
>>> docx.text
"This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊."
>>>
>>> docx.describe()
Key Value
Length : 73
vowels : 21
consonants: 34
stopwords: 4
punctuations: 8
special_char: 8
tokens(whitespace): 10
tokens(words): 14
>>>
>>> docx.length
73
>>> # Scan Percentage of Noise(Unclean data) in text
>>> docx.noise_scan()
{'text_noise': 19.17808219178082, 'text_length': 73, 'noise_count': 14}
>>>
>>> docx.head(16)
'This is the mail'
>>> docx.tail()
>>> docx.count_vowels()
>>> docx.count_stopwords()
>>> docx.count_consonants()
>>> docx.nlongest()
>>> docx.nshortest()
>>> docx.readability()
>>> docx.word_tokens()
>>>
>>> docx.sent_tokens()
>>>
>>> docx.term_freq()
>>>
>>> docx.bow()
>>> docx.normalize()
'this is the mail example@gmail.com ,our website is https://example.com 😊.'
>>> docx.normalize(level='deep')
'this is the mail examplegmailcom our website is httpsexamplecom '
>>> docx.remove_puncts()
>>> docx.remove_stopwords()
>>> docx.remove_html_tags()
>>> docx.remove_special_characters()
>>> docx.remove_emojis()
>>> docx.fix_contractions()
>>> docx.remove_accents()
>>> docx.remove_non_ascii()
>>> import neattext as nt
>>> docx_df = nt.read_txt('file.txt')
>>> import neattext as nt
>>> docx_df = nt.TextFrame().read_txt('file.txt')
>>> t1 = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊 and it will cost $100 to subscribe."
>>> docx = TextFrame(t1)
>>> result = docx.remove_emails().remove_urls().remove_emojis()
>>> print(result)
'This is the mail ,our WEBSITE is and it will cost $100 to subscribe.'
>>> from neattext.functions import clean_text
>>>
>>> mytext = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊."
>>>
>>> clean_text(mytext)
'mail example@gmail.com ,our website https://example.com .'
You can remove punctuations,stopwords,urls,emojis,multiple_whitespaces,etc by setting them to True.
You can choose to remove or not remove punctuations by setting to True/False respectively
>>> clean_text(mytext,puncts=True)
'mail example@gmailcom website https://examplecom '
>>>
>>> clean_text(mytext,puncts=False)
'mail example@gmail.com ,our website https://example.com .'
>>>
>>> clean_text(mytext,puncts=False,stopwords=False)
'this is the mail example@gmail.com ,our website is https://example.com .'
>>>
>>> clean_text(mytext,stopwords=False)
'this is the mail example@gmail.com ,our website is https://example.com .'
>>>
>>> clean_text(mytext,urls=False)
'mail example@gmail.com ,our website https://example.com .'
>>>
>>> clean_text(mytext,urls=True)
'mail example@gmail.com ,our website .'
>>>
>>> import neattext as nt
>>> mytext = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊. Please don't forget the email when you enter !!!!!"
>>> docx = nt.TextFrame(mytext)
>>> docx.remove_puncts()
TextFrame(text="This is the mail example@gmailcom our WEBSITE is https://examplecom 😊 Please dont forget the email when you enter ")
>>> docx.remove_puncts(most_common=False)
TextFrame(text="This is the mail examplegmailcom our WEBSITE is httpsexamplecom 😊 Please dont forget the email when you enter ")
>>> import neattext as nt
>>> mytext = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊. Please don't forget the email when you enter !!!!!"
>>> docx = nt.TextFrame(mytext)
>>> docx.remove_stopwords(lang='en')
TextFrame(text="mail example@gmail.com ,our WEBSITE https://example.com 😊. forget email enter !!!!!")
>>> print(docx.remove_emails())
>>> 'This is the mail ,our WEBSITE is https://example.com 😊.'
>>>
>>> print(docx.remove_stopwords())
>>> 'This mail example@gmail.com ,our WEBSITE https://example.com 😊.'
>>>
>>> print(docx.remove_numbers())
>>> docx.remove_phone_numbers()
>>> docx.remove_btc_address()
>>> docx.remove_special_characters()
>>> print(docx.remove_emojis())
>>> 'This is the mail example@gmail.com ,our WEBSITE is https://example.com .'
remove_custom_pattern()
function
>>> import neattext.functions as nfx
>>> ex = "Last !RT tweeter multiple &#7777"
>>>
>>> nfx.remove_custom_pattern(ex,r'&#\d+')
'Last !RT tweeter multiple '
>>> docx.replace_emails()
>>> docx.replace_numbers()
>>> docx.replace_phone_numbers()
>>> t1 = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊 and it will cost $100 to subscribe."
>>> docx = TextCleaner(t1)
>>> result = docx.remove_emails().remove_urls().remove_emojis()
>>> print(result)
'This is the mail ,our WEBSITE is and it will cost $100 to subscribe.'
>>> from neattext import TextExtractor
>>> docx = TextExtractor()
>>> docx.text = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊."
>>> docx.extract_emails()
>>> ['example@gmail.com']
>>>
>>> docx.extract_emojis()
>>> ['😊']
>>> from neattext import TextMetrics
>>> docx = TextMetrics()
>>> docx.text = "This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊."
>>> docx.count_vowels()
>>> docx.count_consonants()
>>> docx.count_stopwords()
>>> docx.word_stats()
>>> docx.memory_usage()
>>> from neattext.functions import clean_text,extract_emails
>>> t1 = "This is the mail example@gmail.com ,our WEBSITE is https://example.com ."
>>> clean_text(t1,puncts=True,stopwords=True)
>>>'this mail examplegmailcom website httpsexamplecom'
>>> extract_emails(t1)
>>> ['example@gmail.com']
>>> import neattext.functions as nfx
>>> t1 = "This is the mail example@gmail.com ,our WEBSITE is https://example.com ."
>>> nfx.clean_text(t1,puncts=True,stopwords=True)
>>>'this mail examplegmailcom website httpsexamplecom'
>>> nfx.extract_emails(t1)
>>> ['example@gmail.com']
>>> from neattext.explainer import emojify
>>> emojify('Smiley')
>>> '😃'
>>> from neattext.explainer import emoji_explainer
>>> emoji_explainer('😃')
>>> 'SMILING FACE WITH OPEN MOUTH'
>>> from neattext.explainer import unicode_2_emoji
>>> unicode_2_emoji('0x1f49b')
'FLUSHED FACE'
>>> from neattext.pipeline import TextPipeline
>>> t1 = """This is the mail example@gmail.com ,our WEBSITE is https://example.com 😊. This is visa 4111 1111 1111 1111 and bitcoin 1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2 with mastercard 5500 0000 0000 0004. Send it to PO Box 555, KNU"""
>>> p = TextPipeline(steps=[remove_emails,remove_numbers,remove_emojis])
>>> p.fit(t1)
'This is the mail ,our WEBSITE is https://example.com . This is visa and bitcoin BvBMSEYstWetqTFnAumGFgxJaNVN with mastercard . Send it to PO Box , KNU'
>>> p.steps
>>> p.named_steps
Please read the documentation for more information on what neattext does and how to use it for your needs. You can also check out our readthedocs page here.
clean-text
from Johannes Filter and textify
by JCharisTech