N Grams Python Program
from nltk.corpus import brown
from nltk.tokenize import word_tokenize

corpus = brown.words()
lower_case_corpus = [word.lower() for word in corpus]
vocab = set(lower_case_corpus)
print('CORPUS EXAMPLE: ' + str(lower_case_corpus[:30]) + '\n\n')
print('VOCAB EXAMPLE: ' + str(list(vocab)[:10]))
CORPUS EXAMPLE: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.', 'the', 'jury', 'further', 'said', 'in']

VOCAB EXAMPLE: ['drudgery', 'one-arm', 'growling', 'cutest', 'rain', 'hops', "network's", 'expressionists', 'polarization', 'gaussian']
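If the Brown corpus and the tokenizer models used by word_tokenize are not already present locally, NLTK needs a one-time download; a minimal sketch (resource names assume a standard NLTK install):

import nltk
nltk.download('brown')   # the Brown corpus used as training text
nltk.download('punkt')   # tokenizer models for word_tokenize (newer NLTK releases may also need 'punkt_tab')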
bigram_counts = {}
trigram_counts = {}

# sliding a window over the corpus to collect bigram and trigram counts
for i in range(len(lower_case_corpus) - 2):
    bigram = (lower_case_corpus[i], lower_case_corpus[i + 1])
    trigram = (lower_case_corpus[i], lower_case_corpus[i + 1], lower_case_corpus[i + 2])

    # keeping track of bigram counts
    if bigram in bigram_counts:
        bigram_counts[bigram] += 1
    else:
        bigram_counts[bigram] = 1

    # keeping track of trigram counts
    if trigram in trigram_counts:
        trigram_counts[trigram] += 1
    else:
        trigram_counts[trigram] = 1
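As an aside, the same counts can be built more compactly with collections.Counter and nltk.util.ngrams; a minimal sketch, equivalent up to the last one or two tokens of the corpus (the loop above stops two tokens before the end):

from collections import Counter
from nltk.util import ngrams

bigram_counts_alt = Counter(ngrams(lower_case_corpus, 2))
trigram_counts_alt = Counter(ngrams(lower_case_corpus, 3))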
# Function takes a sentence as input and suggests possible words that come after it
def suggest_next_word(input_, bigram_counts, trigram_counts, vocab):
    # tokenize the input and keep its last two words as the context bigram
    tokenized_input = word_tokenize(input_.lower())
    last_bigram = tokenized_input[-2:]

    # estimate P(word | context bigram) for every word in the vocabulary
    vocab_probabilities = {}
    for vocab_word in vocab:
        test_trigram = (last_bigram[0], last_bigram[1], vocab_word)
        test_bigram = (last_bigram[0], last_bigram[1])

        test_trigram_count = trigram_counts.get(test_trigram, 0)
        test_bigram_count = bigram_counts.get(test_bigram, 0)

        # count(w1, w2, word) / count(w1, w2); guard against an unseen context bigram
        probability = test_trigram_count / test_bigram_count if test_bigram_count > 0 else 0.0
        vocab_probabilities[vocab_word] = probability

    # sorting the vocab probabilities in descending order to get the top probable words
    return sorted(vocab_probabilities.items(), key=lambda item: item[1], reverse=True)[:3]
[('james', 0.17647058823529413), ('of', 0.1568627450980392), ('arthur', 0.11764705882352941)]
[('france', 0.3333333), ('hearts', 0.1666666), ('morocco', 0.0833333)]
[('the', 0.2), ('germany', 0.1333), ('some', 0.066667)]
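Each of the three lists above is the top-3 suggestion list returned for a different input phrase (the prompts themselves are not reproduced here). A call looks like the following; the sentence is purely hypothetical:

print(suggest_next_word('the jury said that the', bigram_counts, trigram_counts, vocab))
# prints a list of (word, probability) tuples for the three most probable next words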