
Text Preprocessing Demo

Context¶

The purpose of this notebook is to demonstrate a few common text preprocessing tasks using spaCy.

Load spaCy Language Model¶

The spaCy library for Python provides a wide range of modules for NLP tasks. It ships with many preprocessing components and integrates with several popular deep learning frameworks.

In [1]:
import spacy
import pandas as pd
import numpy as np

spaCy 101: https://spacy.io/usage/spacy-101#whats-spacy

In [2]:
# IPython.core.display is deprecated as a public path; IPython.display is current.
from IPython.display import SVG
SVG(filename='SpacyArchitecture.svg')
Out[2]:
(spaCy architecture diagram: the Language object ties together the shared Vocab and StringStore, the Tokenizer that creates the Doc, and the pipeline components — Tagger, Dependency Parser, Entity Recognizer, TextCategorizer, and custom components — that annotate it.)

Load News Data¶

The file contains a single sample news item from the financial domain (a money-market report).

In [3]:
with open("./data/sample-news-item.txt", "r") as f:
    news_item = f.read()
news_item
Out[3]:
"TAIPEI, April 15 (Reuter) - Taiwan's money rates finished\nmixed on Monday, dealers expecting overnight to rise further\namid current bullish stock market and income tax payments.\n    Overnight ended at 6.134 percent against Saturday's 5.949,\nwhile 30-day commercial paper fell to 7.00 from 7.10-7.15.\n    Though Taiwan share prices hit a new 11-month high on\nMonday, attracting liquidity into the stock market, bond traders\ndid not expect significantly tighter conditions as the central\nbank monetary policy remains relaxed.\n"

spaCy Pipeline¶

(diagram of the spaCy processing pipeline: raw text passes through the tokenizer and then the pipeline components listed below to produce a Doc)

In [4]:
nlp = spacy.load('en_core_web_lg')
print(type(nlp))
nlp.pipeline
<class 'spacy.lang.en.English'>
Out[4]:
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x189969a8ca0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x189969a9f00>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x189968822d0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x18996b4b780>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x18996b7e680>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1899681fed0>)]
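
The full pipeline is not always needed. As a minimal sketch (assuming the standard spaCy v3 API; this cell is not part of the original run), components can be disabled at load time or switched off temporarily, which speeds up processing considerably:

In [ ]:
# Sketch: skip the parser and NER when only tokenization and tagging are needed.
nlp_light = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

# Or disable components temporarily with a context manager.
with nlp.select_pipes(disable=['parser', 'ner']):
    doc_fast = nlp(news_item)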

Create a Doc object¶

In [5]:
doc = nlp(news_item)
print(type(doc))
print(len(doc.vocab))
<class 'spacy.tokens.doc.Doc'>
840

Print all the tokens¶

In [6]:
[(t.text, type(t)) for t in doc]
Out[6]:
[('TAIPEI', spacy.tokens.token.Token),
 (',', spacy.tokens.token.Token),
 ('April', spacy.tokens.token.Token),
 ('15', spacy.tokens.token.Token),
 ('(', spacy.tokens.token.Token),
 ('Reuter', spacy.tokens.token.Token),
 (')', spacy.tokens.token.Token),
 ('-', spacy.tokens.token.Token),
 ('Taiwan', spacy.tokens.token.Token),
 ("'s", spacy.tokens.token.Token),
 ('money', spacy.tokens.token.Token),
 ('rates', spacy.tokens.token.Token),
 ('finished', spacy.tokens.token.Token),
 ('\n', spacy.tokens.token.Token),
 ('mixed', spacy.tokens.token.Token),
 ('on', spacy.tokens.token.Token),
 ('Monday', spacy.tokens.token.Token),
 (',', spacy.tokens.token.Token),
 ('dealers', spacy.tokens.token.Token),
 ('expecting', spacy.tokens.token.Token),
 ('overnight', spacy.tokens.token.Token),
 ('to', spacy.tokens.token.Token),
 ('rise', spacy.tokens.token.Token),
 ('further', spacy.tokens.token.Token),
 ('\n', spacy.tokens.token.Token),
 ('amid', spacy.tokens.token.Token),
 ('current', spacy.tokens.token.Token),
 ('bullish', spacy.tokens.token.Token),
 ('stock', spacy.tokens.token.Token),
 ('market', spacy.tokens.token.Token),
 ('and', spacy.tokens.token.Token),
 ('income', spacy.tokens.token.Token),
 ('tax', spacy.tokens.token.Token),
 ('payments', spacy.tokens.token.Token),
 ('.', spacy.tokens.token.Token),
 ('\n    ', spacy.tokens.token.Token),
 ('Overnight', spacy.tokens.token.Token),
 ('ended', spacy.tokens.token.Token),
 ('at', spacy.tokens.token.Token),
 ('6.134', spacy.tokens.token.Token),
 ('percent', spacy.tokens.token.Token),
 ('against', spacy.tokens.token.Token),
 ('Saturday', spacy.tokens.token.Token),
 ("'s", spacy.tokens.token.Token),
 ('5.949', spacy.tokens.token.Token),
 (',', spacy.tokens.token.Token),
 ('\n', spacy.tokens.token.Token),
 ('while', spacy.tokens.token.Token),
 ('30', spacy.tokens.token.Token),
 ('-', spacy.tokens.token.Token),
 ('day', spacy.tokens.token.Token),
 ('commercial', spacy.tokens.token.Token),
 ('paper', spacy.tokens.token.Token),
 ('fell', spacy.tokens.token.Token),
 ('to', spacy.tokens.token.Token),
 ('7.00', spacy.tokens.token.Token),
 ('from', spacy.tokens.token.Token),
 ('7.10', spacy.tokens.token.Token),
 ('-', spacy.tokens.token.Token),
 ('7.15', spacy.tokens.token.Token),
 ('.', spacy.tokens.token.Token),
 ('\n    ', spacy.tokens.token.Token),
 ('Though', spacy.tokens.token.Token),
 ('Taiwan', spacy.tokens.token.Token),
 ('share', spacy.tokens.token.Token),
 ('prices', spacy.tokens.token.Token),
 ('hit', spacy.tokens.token.Token),
 ('a', spacy.tokens.token.Token),
 ('new', spacy.tokens.token.Token),
 ('11', spacy.tokens.token.Token),
 ('-', spacy.tokens.token.Token),
 ('month', spacy.tokens.token.Token),
 ('high', spacy.tokens.token.Token),
 ('on', spacy.tokens.token.Token),
 ('\n', spacy.tokens.token.Token),
 ('Monday', spacy.tokens.token.Token),
 (',', spacy.tokens.token.Token),
 ('attracting', spacy.tokens.token.Token),
 ('liquidity', spacy.tokens.token.Token),
 ('into', spacy.tokens.token.Token),
 ('the', spacy.tokens.token.Token),
 ('stock', spacy.tokens.token.Token),
 ('market', spacy.tokens.token.Token),
 (',', spacy.tokens.token.Token),
 ('bond', spacy.tokens.token.Token),
 ('traders', spacy.tokens.token.Token),
 ('\n', spacy.tokens.token.Token),
 ('did', spacy.tokens.token.Token),
 ('not', spacy.tokens.token.Token),
 ('expect', spacy.tokens.token.Token),
 ('significantly', spacy.tokens.token.Token),
 ('tighter', spacy.tokens.token.Token),
 ('conditions', spacy.tokens.token.Token),
 ('as', spacy.tokens.token.Token),
 ('the', spacy.tokens.token.Token),
 ('central', spacy.tokens.token.Token),
 ('\n', spacy.tokens.token.Token),
 ('bank', spacy.tokens.token.Token),
 ('monetary', spacy.tokens.token.Token),
 ('policy', spacy.tokens.token.Token),
 ('remains', spacy.tokens.token.Token),
 ('relaxed', spacy.tokens.token.Token),
 ('.', spacy.tokens.token.Token),
 ('\n', spacy.tokens.token.Token)]

spaCy attaches attributes to each token¶

In [7]:
for token in doc:
    print(token.text, token.pos_, token.dep_)
    break
TAIPEI PROPN nsubj
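
Since pandas is already imported, the attributes of many tokens can be inspected at once by collecting them into a DataFrame (a minimal sketch, not part of the original run):

In [ ]:
# Sketch: tabulate common token attributes with pandas.
attrs = [(t.text, t.pos_, t.dep_, t.lemma_, t.is_stop, t.is_punct) for t in doc]
pd.DataFrame(attrs, columns=['text', 'pos', 'dep', 'lemma', 'is_stop', 'is_punct']).head(10)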

Explore various features: Slicing, Parts of Speech, Dependencies, Lemma, Span¶

In [8]:
print(doc[8], "\n",
      type(doc[8]),"\n", 
      doc[8].pos_,"\n",
      doc[8].dep_, "\n",
      doc[8].lemma_,"\n",
      doc[8].tag_, "\n",
      doc[1:9],"\n",
      type(doc[1:9]))
Taiwan 
 <class 'spacy.tokens.token.Token'> 
 PROPN 
 poss 
 Taiwan 
 NNP 
 , April 15 (Reuter) - Taiwan 
 <class 'spacy.tokens.span.Span'>
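
A Span can also be built directly from character offsets. doc.char_span returns None when the offsets do not line up with token boundaries, so the sketch below (not part of the original run) guards for that:

In [ ]:
# Sketch: build a Span from character offsets; None means the offsets
# do not align with token boundaries.
span = doc.char_span(0, 6)  # "TAIPEI"
if span is not None:
    print(span.text, span.start, span.end)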

Get Sentences¶

In [9]:
for i, sent in enumerate(doc.sents):
    print(f"Sentence {i} \n {sent} \n")
Sentence 0 
 TAIPEI, April 15 (Reuter) - Taiwan's money rates finished
mixed on Monday, dealers expecting overnight to rise further
amid current bullish stock market and income tax payments. 

Sentence 1 
 
    Overnight ended at 6.134 percent against Saturday's 5.949,
while 30-day commercial paper fell to 7.00 from 7.10-7.15. 

Sentence 2 
 
    Though Taiwan share prices hit a new 11-month high on
Monday, attracting liquidity into the stock market, bond traders
did not expect significantly tighter conditions as the central
bank monetary policy remains relaxed. 

Sentence 3 
 
 

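The empty Sentence 3 comes from the trailing newline in the raw text. A simple way to drop whitespace-only sentences (a sketch, not part of the original run):

In [ ]:
# Sketch: keep only sentences that contain non-whitespace text.
sentences = [s for s in doc.sents if s.text.strip()]
len(sentences)
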
Named Entities¶

In [10]:
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}")
TAIPEI - GPE - Countries, cities, states
April 15 - DATE - Absolute or relative dates or periods
Taiwan - GPE - Countries, cities, states
Monday - DATE - Absolute or relative dates or periods
Overnight - TIME - Times smaller than a day
6.134 percent - PERCENT - Percentage, including "%"
Saturday - DATE - Absolute or relative dates or periods
5.949 - CARDINAL - Numerals that do not fall under another type
30-day - DATE - Absolute or relative dates or periods
7.00 - CARDINAL - Numerals that do not fall under another type
7.10 - CARDINAL - Numerals that do not fall under another type
Taiwan - GPE - Countries, cities, states
11-month - DATE - Absolute or relative dates or periods
Monday - DATE - Absolute or relative dates or periods
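
To see which entity types dominate the document, the labels can be tallied with collections.Counter (a sketch, not part of the original run):

In [ ]:
from collections import Counter

# Sketch: count entity labels across the document.
Counter(ent.label_ for ent in doc.ents)
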
In [11]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)
TAIPEI GPE , April 15 DATE (Reuter) - Taiwan GPE 's money rates finished
mixed on Monday DATE , dealers expecting overnight to rise further
amid current bullish stock market and income tax payments.
Overnight TIME ended at 6.134 percent PERCENT against Saturday DATE 's 5.949 CARDINAL ,
while 30-day DATE commercial paper fell to 7.00 CARDINAL from 7.10 CARDINAL -7.15.
Though Taiwan GPE share prices hit a new 11-month DATE high on
Monday DATE , attracting liquidity into the stock market, bond traders
did not expect significantly tighter conditions as the central
bank monetary policy remains relaxed.

Noun Chunks¶

Noun chunks are flat phrases headed by a noun.

In [12]:
for chunk in doc.noun_chunks:
    print(chunk.text)
TAIPEI
April
(Reuter
Taiwan's money rates
Monday
dealers
current bullish stock market and income tax payments

    Overnight
6.134 percent
30-day commercial paper
Taiwan share prices
a new 11-month high
Monday
liquidity
the stock market
bond traders
significantly tighter conditions
the central
bank monetary policy
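
Each chunk exposes its head noun through chunk.root; the root's dependency label shows how the phrase attaches to the rest of the sentence (a sketch, not part of the original run):

In [ ]:
# Sketch: the head noun of each chunk and its dependency relation.
for chunk in doc.noun_chunks:
    print(chunk.text, '->', chunk.root.text, chunk.root.dep_)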

Removing Spaces¶

In [13]:
tokens = [t for t in doc if not t.is_space]
" ".join([t.text for t in tokens])
Out[13]:
"TAIPEI , April 15 ( Reuter ) - Taiwan 's money rates finished mixed on Monday , dealers expecting overnight to rise further amid current bullish stock market and income tax payments . Overnight ended at 6.134 percent against Saturday 's 5.949 , while 30 - day commercial paper fell to 7.00 from 7.10 - 7.15 . Though Taiwan share prices hit a new 11 - month high on Monday , attracting liquidity into the stock market , bond traders did not expect significantly tighter conditions as the central bank monetary policy remains relaxed ."

Removing Punctuation¶

In [14]:
tokens = [t for t in doc if not t.is_punct]
" ".join([t.text for t in tokens])
Out[14]:
"TAIPEI April 15 Reuter Taiwan 's money rates finished \n mixed on Monday dealers expecting overnight to rise further \n amid current bullish stock market and income tax payments \n     Overnight ended at 6.134 percent against Saturday 's 5.949 \n while 30 day commercial paper fell to 7.00 from 7.10 7.15 \n     Though Taiwan share prices hit a new 11 month high on \n Monday attracting liquidity into the stock market bond traders \n did not expect significantly tighter conditions as the central \n bank monetary policy remains relaxed \n"

Removing Stopwords¶

In [15]:
# Only the last expression in a cell is displayed, so the set below is
# shown rather than its length.
len(nlp.Defaults.stop_words)
nlp.Defaults.stop_words
Out[15]:
{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'made',
 'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 "n't",
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'no',
 'nobody',
 'none',
 'noone',
 'nor',
 'not',
 'nothing',
 'now',
 'nowhere',
 'n‘t',
 'n’t',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'she',
 'should',
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 'take',
 'ten',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 'very',
 'via',
 'was',
 'we',
 'well',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve'}
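
The default list can be customized. A minimal sketch (assuming the standard spaCy v3 API; not part of the original run) that updates both the Defaults set and the per-lexeme is_stop flag:

In [ ]:
# Sketch: add a domain-specific stop word ...
nlp.Defaults.stop_words.add('percent')
nlp.vocab['percent'].is_stop = True

# ... and drop a default one.
nlp.Defaults.stop_words.discard('further')
nlp.vocab['further'].is_stop = False
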
In [16]:
tokens = [t for t in doc if not t.is_stop]
" ".join([t.text for t in tokens])
Out[16]:
'TAIPEI , April 15 ( Reuter ) - Taiwan money rates finished \n mixed Monday , dealers expecting overnight rise \n amid current bullish stock market income tax payments . \n     Overnight ended 6.134 percent Saturday 5.949 , \n 30 - day commercial paper fell 7.00 7.10 - 7.15 . \n     Taiwan share prices hit new 11 - month high \n Monday , attracting liquidity stock market , bond traders \n expect significantly tighter conditions central \n bank monetary policy remains relaxed . \n'

Removing Numbers¶

In [17]:
tokens = [t for t in doc if not t.like_num]
" ".join([t.text for t in tokens])
Out[17]:
"TAIPEI , April ( Reuter ) - Taiwan 's money rates finished \n mixed on Monday , dealers expecting overnight to rise further \n amid current bullish stock market and income tax payments . \n     Overnight ended at percent against Saturday 's , \n while - day commercial paper fell to from - . \n     Though Taiwan share prices hit a new - month high on \n Monday , attracting liquidity into the stock market , bond traders \n did not expect significantly tighter conditions as the central \n bank monetary policy remains relaxed . \n"

Lemmatization¶

In [18]:
tokens = [t for t in doc if t.is_alpha and not(t.is_space or t.is_punct or t.is_stop or t.like_num)]
print(" ".join([t.text for t in tokens ]))
print(" ".join([t.lemma_  for t in tokens ]))
TAIPEI April Reuter Taiwan money rates finished mixed Monday dealers expecting overnight rise amid current bullish stock market income tax payments Overnight ended percent Saturday day commercial paper fell Taiwan share prices hit new month high Monday attracting liquidity stock market bond traders expect significantly tighter conditions central bank monetary policy remains relaxed
TAIPEI April Reuter Taiwan money rate finish mix Monday dealer expect overnight rise amid current bullish stock market income tax payment Overnight end percent Saturday day commercial paper fall Taiwan share price hit new month high Monday attract liquidity stock market bond trader expect significantly tight condition central bank monetary policy remain relaxed
In [19]:
for i, token in enumerate(doc):
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)
    if i > 10:
        break
TAIPEI 	 PROPN 	 11625247729534412431 	 TAIPEI
, 	 PUNCT 	 2593208677638477497 	 ,
April 	 PROPN 	 6762527065225415734 	 April
15 	 NUM 	 13771760024209633521 	 15
( 	 PUNCT 	 12638816674900267446 	 (
Reuter 	 PROPN 	 4553298858418451913 	 Reuter
) 	 PUNCT 	 3842344029291005339 	 )
- 	 PUNCT 	 9153284864653046197 	 -
Taiwan 	 PROPN 	 9905814886496518454 	 Taiwan
's 	 PART 	 16428057658620181782 	 's
money 	 NOUN 	 14917444839815175757 	 money
rates 	 NOUN 	 17781086385965795670 	 rate
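
The large integer in the third column is token.lemma, the hash of the lemma string in the shared StringStore; the mapping works in both directions (a sketch, not part of the original run):

In [ ]:
# Sketch: round-trip a string through the shared StringStore.
lemma_hash = doc.vocab.strings['rate']
print(lemma_hash, doc.vocab.strings[lemma_hash])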

POS tagging¶

In [20]:
tokens = [t for t in doc if t.is_alpha and not(t.is_space or t.is_punct or t.is_stop or t.like_num)]
print(" ".join([t.pos_  for t in tokens ]))
PROPN PROPN PROPN PROPN NOUN NOUN VERB VERB PROPN NOUN VERB ADV VERB ADP ADJ ADJ NOUN NOUN NOUN NOUN NOUN PROPN VERB NOUN PROPN NOUN ADJ NOUN VERB PROPN NOUN NOUN VERB ADJ NOUN NOUN PROPN VERB NOUN NOUN NOUN NOUN NOUN VERB ADV ADJ NOUN ADJ NOUN ADJ NOUN VERB ADJ
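
spacy.explain works for POS tags as well as entity labels, and Counter gives a quick tag distribution (a sketch, not part of the original run):

In [ ]:
from collections import Counter

# Sketch: distribution of coarse POS tags over the filtered tokens.
for pos, n in Counter(t.pos_ for t in tokens).most_common():
    print(pos, n, spacy.explain(pos))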

Create Document Term Matrix¶

Load the dataset¶

The dataset comprises sample news items from the financial domain, related to commodity arbitrage (N2:COMARB) and loans (N2:LOA).

In [21]:
news_items = pd.read_csv("./data/news-body-samples-v1.csv", sep="\t")
news_items.topic.value_counts()  # pre-filter counts (not displayed)
# Keep only articles whose body length is between 300 and 6000 characters.
cond       = news_items.apply(lambda x: 300 <= len(x['body']) <= 6000, axis=1)
news_items = news_items.assign(l_status=cond)
news_items = news_items[news_items.l_status == True]
news_items.topic.value_counts()
Out[21]:
N2:COMARB    1484
N2:LOA       1368
Name: topic, dtype: int64
In [22]:
def my_tokenizer(text):
    # Keep alphabetic tokens that are not spaces, punctuation, stop words or numbers.
    tokens = [t for t in nlp(text) if t.is_alpha and not(t.is_space or t.is_punct or t.is_stop or t.like_num)]
    # "-PRON-" was spaCy v2's pronoun lemma; the guard is harmless under v3.
    return [t.lemma_.lower().strip() if t.lemma_ != "-PRON-" else t.lower_ for t in tokens]

The sklearn.feature_extraction module can be used to create a document-term matrix; the custom tokenizer above is passed to CountVectorizer through its tokenizer argument.

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

ct_vectorizer = CountVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1),
                                min_df=0.2,
                                max_df=0.9,
                                max_features=100)


X = ct_vectorizer.fit_transform(news_items.iloc[:20,0].values)

What are the features?¶

In [26]:
# get_feature_names() is deprecated in newer scikit-learn releases;
# get_feature_names_out() is the current replacement.
feature_names = ct_vectorizer.get_feature_names()
print(feature_names)
print(len(feature_names))
['announce', 'arranger', 'bank', 'basis', 'bond', 'bullet', 'central', 'co', 'commitment', 'company', 'coupon', 'credit', 'date', 'debt', 'development', 'end', 'exchange', 'facility', 'fee', 'finance', 'firm', 'force', 'friday', 'fuji', 'fund', 'general', 'high', 'hong', 'interbank', 'international', 'investment', 'issue', 'kong', 'launch', 'lead', 'level', 'limit', 'loan', 'london', 'manager', 'market', 'maturity', 'month', 'morgan', 'newsroom', 'offer', 'par', 'pay', 'payment', 'percent', 'point', 'price', 'public', 'rate', 'repay', 'repayment', 'reply', 'reuter', 'say', 'set', 'statement', 'syndication', 'taipei', 'taiwan', 'tax', 'term', 'thursday', 'year', 'yield']
69

View the Document Term Matrix¶

In [25]:
print(X.toarray().shape)
X.toarray()
(20, 69)
Out[25]:
array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 5, ..., 0, 2, 0],
       [0, 2, 2, ..., 0, 2, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 2, 3],
       [0, 1, 0, ..., 0, 1, 0]], dtype=int64)
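
For easier inspection, the sparse matrix can be wrapped in a DataFrame with the learned vocabulary as column labels (a sketch, not part of the original run):

In [ ]:
# Sketch: label the document-term matrix with the vocabulary.
pd.DataFrame(X.toarray(), columns=feature_names).head()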