# (page-scrape artifact removed: "You are on page 1 of 3")

# -*- coding: utf-8 -*-
import nltk

import urllib2
import string
from bs4 import BeautifulSoup
url = "https://www.iiitd.ac.in/people/faculty"
#doc = "The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital computer or the gears of a cycle transmission as he does at the top of a mountain or in the petals of a flower. To think otherwise is to demean the Buddha which is to demean oneself."

# Fetch the faculty page and reduce it to plain, punctuation-free text.
html_doc = urllib2.urlopen(url)
soup = BeautifulSoup(html_doc, 'html.parser')
source = soup.get_text()

# Keep only characters that encode cleanly as UTF-8; replace the rest with
# spaces so word boundaries survive.  Build a list and join once instead of
# repeated string concatenation (the original s += i loop is quadratic), and
# catch only encoding failures rather than a bare except.
chars = []
for ch in source:
    try:
        ch.encode("utf-8")
        chars.append(ch)
    except UnicodeError:
        chars.append(" ")
s = "".join(chars)

faculty = s.encode("utf-8")
# Strip all ASCII punctuation before tokenizing.
exclude = set(string.punctuation)
faculty = ''.join(ch for ch in faculty if ch not in exclude)

#print s
# Verbose (?x) regex used by nltk.regexp_tokenize: abbreviations, hyphenated
# words, currency/percentages, ellipses, and single punctuation tokens.
sentence_re = r'''(?x)
      # abbreviations, e.g. U.S.A. (with optional last period)
      ([A-Z])(\.[A-Z])+\.?
      # words with optional internal hyphens
    | \w+(-\w+)*
      # currency and percentages, e.g. $12.40, 82%
    | \$?\d+(\.\d+)?%?
      # ellipsis
    | \.\.\.
      # these are separate tokens; '-' is placed last in the class so it is a
      # literal hyphen -- the original ':-_' formed an accidental character
      # range from ':' to '_' and never matched '-' itself
    | [][.,;"'?():_`-]
'''
faculty = faculty.decode('utf8', 'ignore')
tokens2 = nltk.word_tokenize(faculty)

tokens3=[]
for i in tokens2:
if(len(i)>1):
if (i[1]=="x"):
tokens2.remove(i)
toks = nltk.regexp_tokenize(faculty, sentence_re)
print toks

#print tokens3

# POS-tag the regexp tokens; each entry becomes a (word, tag) pair.
postoks = nltk.tag.pos_tag(toks)

from nltk.corpus import stopwords
# Rebinding 'stopwords' shadows the module just imported; the name is kept
# because acceptable_word() reads it.  A set gives O(1) membership tests
# instead of scanning a list for every word.
stopwords = set(stopwords.words('english'))
#print postoks
#print toks

# Shared normalisation helpers used by normalise().
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
# Chunk grammar: an NBAR is a run of nouns/adjectives ending in a noun; an
# NP is a bare NBAR, or two NBARs joined by a preposition (in/of/...).
grammar = r"""
NBAR:
# Nouns and Adjectives, terminated with Nouns
{<NN.*|JJ>*<NN.*>}
NP:
{<NBAR>}
# Above, connected with in/of/etc...
{<NBAR><IN><NBAR>}
"""
# Parse the tagged tokens into a chunk tree using the grammar above.
chunker = nltk.RegexpParser(grammar)
tree = chunker.parse(postoks)
def leaves(tree):
    """Yield the leaf sequence of every NP (noun-phrase) subtree."""
    is_np = lambda t: t.label() == 'NP'
    for np_subtree in tree.subtrees(filter=is_np):
        yield np_subtree.leaves()
def normalise(word):
    """Lowercase, stem, and lemmatize *word* so variants compare equal."""
    word = word.lower()
    # PorterStemmer.stem_word() was removed from NLTK; stem() is the API.
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word
def acceptable_word(word):
    """Return True for words of reasonable length that are not stopwords."""
    return 2 <= len(word) <= 40 and word.lower() not in stopwords
def get_terms(tree):
    """Yield each noun phrase as a list of normalised, acceptable words."""
    for np_leaves in leaves(tree):
        yield [normalise(word) for word, _tag in np_leaves
               if acceptable_word(word)]
# Lazily extract normalised noun-phrase terms from the chunk tree.
terms = get_terms(tree)
# Disabled example of consuming the generator, one phrase per line:
#for term in terms:
#for word in term:
#print word,
#print

# (page-scrape artifact removed: "You might also like")