Professional Documents
Culture Documents
import string
import urllib2

import nltk
from bs4 import BeautifulSoup
# Page to download and extract text from.
url = "https://www.iiitd.ac.in/people/faculty"
# Sample input for offline experiments (unused):
# doc = "The Buddha, the Godhead, resides quite as comfortably in the circuits of
# a digital computer or the gears of a cycle transmission as he does at the top
# of a mountain or in the petals of a flower. To think otherwise is to demean the
# Buddha which is to demean oneself."

# Download and parse the page; the HTTP response is closed even if parsing
# fails, so the connection is not leaked.
html_doc = urllib2.urlopen(url)
try:
    soup = BeautifulSoup(html_doc, 'html.parser')
finally:
    html_doc.close()
source = soup.get_text()

# Swap every character that cannot be UTF-8 encoded for a space.  Only
# encoding errors are caught (not every exception), and the pieces are joined
# once instead of growing a string with += inside the loop.
pieces = []
for ch in source:
    try:
        ch.encode("utf-8")
    except UnicodeError:
        pieces.append(" ")
    else:
        pieces.append(ch)
s = "".join(pieces)

# From here on `faculty` is a UTF-8 byte string with every ASCII punctuation
# character removed.
faculty = s.encode("utf-8")
exclude = set(string.punctuation)
faculty = ''.join(ch for ch in faculty if ch not in exclude)
#print s
# Verbose-mode token pattern passed to nltk.regexp_tokenize.  Every group is
# non-capturing: with capturing groups a findall-style scan returns tuples of
# group contents instead of the text of each matched token.
sentence_re = r'''(?x)
      # abbreviations, e.g. U.S.A. (with optional last period)
      (?:[A-Z])(?:\.[A-Z])+\.?
      # words with optional internal hyphens
    | \w+(?:-\w+)*
      # currency and percentages, e.g. $12.40, 82%
    | \$?\d+(?:\.\d+)?%?
      # ellipsis
    | \.\.\.
      # these are separate tokens; "-" comes last so it is a literal hyphen
      # rather than the start of a character range
    | [][.,;"'?():_`-]
'''
# Back to text: any byte sequence that is not valid UTF-8 is dropped.
faculty = faculty.decode('utf8', 'ignore')
tokens2 = nltk.word_tokenize(faculty)
tokens3 = []
# Discard tokens whose second character is "x".  A new list is built because
# calling remove() on the list being iterated makes the loop skip the element
# that follows each removal.
tokens2 = [tok for tok in tokens2 if not (len(tok) > 1 and tok[1] == "x")]
# Tokenize again with the custom pattern; these tokens feed the POS tagger.
toks = nltk.regexp_tokenize(faculty, sentence_re)
# Parenthesised single-argument form prints the same under Python 2.
print(toks)
#print tokens3
# Tag every token with its part of speech.
postoks = nltk.tag.pos_tag(toks)
# English stop-word list from the NLTK corpus.  The corpus reader is imported
# under an alias so the name `stopwords` refers only to the resulting list.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')
#print postoks
#print toks
# Shared word normalisers: a Porter stemmer and a WordNet lemmatizer.
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
# Chunk grammar handed to nltk.RegexpParser:
#   NBAR - any run of noun (NN*) or adjective (JJ) tags that ends in a noun
#   NP   - a single NBAR, or two NBARs joined by an <IN> tag (in/of/etc.)
grammar = r"""
NBAR:
# Nouns and Adjectives, terminated with Nouns
{<NN.*|JJ>*<NN.*>}
NP:
{<NBAR>}
# Above, connected with in/of/etc...
{<NBAR><IN><NBAR>}
"""
# Build the chunker from the grammar and run it over the POS-tagged tokens.
chunker = nltk.RegexpParser(grammar)
tree = chunker.parse(postoks)
def leaves(tree):
    """Yield the leaf list of each subtree of *tree* that is labelled 'NP'."""
    def is_noun_phrase(node):
        return node.label() == 'NP'

    for noun_phrase in tree.subtrees(filter=is_noun_phrase):
        yield noun_phrase.leaves()
def normalise(word):
    """Return *word* lowercased, Porter-stemmed and then WordNet-lemmatized.

    Uses the module-level ``stemmer`` and ``lemmatizer`` objects.
    """
    word = word.lower()
    # stem() is the supported PorterStemmer entry point; stem_word() is not
    # available in current NLTK releases and raises AttributeError there.
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word
def acceptable_word(word):