sqlite3 Scrapes_DeContrabas.sqlite3db
drop table if exists links;
create table links(
    which varchar(100),
    mainsite varchar(100),
    searchTime DATE,
    fullURL varchar(500)
);
create unique index if not exists idx_fullURL on links(fullURL);

drop table if exists decontrabas;
create table decontrabas(
    which varchar(100),
    mainsite varchar(100),
    searchTime DATE,
    fullURL varchar(500),
    downloadSucceeded numeric,
    parsingSucceeded numeric,
    content BLOB
);
create unique index if not exists idx_decontrabas_fullURL on decontrabas(fullURL);

drop table if exists decontrabas_cleaned;
create table decontrabas_cleaned(
    which varchar(100),
    mainsite varchar(100),
    searchTime DATE,
    fullURL varchar(500),
    downloadSucceeded numeric,
    parsingSucceeded numeric,
    content BLOB
);
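-- A quick sanity check after running the statements above; a sketch to run in the same sqlite3 shell:
-- .tables            (expected: decontrabas  decontrabas_cleaned  links)
-- .schema links      (prints the links table definition)
-- The Python script below fills these tables.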
import urllib2
import re
import os
from time import gmtime, strftime, localtime
from pysqlite2 import dbapi2 as sqlite
from BeautifulSoup import BeautifulSoup, Comment
############################################################################
## Database class containing the scrape results
############################################################################
class ScrapeDatabase():
    def __init__(self, path, dbname, dbtable):
        self._dbpath = os.path.join(path, dbname)
        self._dbtable = dbtable

    def connect(self):
        self._connection = sqlite.connect(self._dbpath)
        cur = self._connection.cursor()
        cur.execute("select max(rowid) from " + self._dbtable)
        self._maxrowid = cur.fetchall()[0][0]
        self._connection.commit()
        cur.close()

    def close(self):
        self._connection.close()

    def pr(self):
        print self._maxrowid

    def insertParsed(self, which, mainsite, searchTime, fullURL):
        cur = self._connection.cursor()
        ## The stored search time is always the current GMT time, whatever the caller passed in
        searchTime = str(strftime("%Y%m%d %H:%M:%S", gmtime()))
        cur.execute("INSERT INTO " + self._dbtable + " VALUES(?,?,?,?)",
                    (which, mainsite, searchTime, fullURL))
        self._connection.commit()
        cur.close()

    def insertBeautifulSouped(self, table, which, mainsite, searchTime, fullURL,
                              downloadSucceeded, parsingSucceeded, content):
        cur = self._connection.cursor()
        cur.execute("INSERT INTO " + table + " VALUES(?,?,?,?,?,?,?)",
                    (which, mainsite, searchTime, fullURL, downloadSucceeded, parsingSucceeded, content))
        self._connection.commit()
        cur.close()

    def getallurls(self):
        cur = self._connection.cursor()
        cur.execute("select distinct fullURL from " + self._dbtable)
        urls = cur.fetchall()
        self._connection.commit()
        cur.close()
        return urls

    def getQuery(self, query):
        cur = self._connection.cursor()
        cur.execute(query)
        rows = cur.fetchall()
        self._connection.commit()
        cur.close()
        return rows
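## A minimal usage sketch of ScrapeDatabase (the path and URLs below are just examples):
## db = ScrapeDatabase("/tmp", "Scrapes_DeContrabas.sqlite3db", "links")
## db.connect()
## db.insertParsed("De Contrabas", "http://www.decontrabas.com", "", "http://www.decontrabas.com/2010/01/some-post.html")
## print db.getallurls()    ## something like [(u'http://www.decontrabas.com/2010/01/some-post.html',)]
## db.close()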
############################################################################
## Method to get all links
############################################################################
def fetchSearchResult(which, url, needToContain, scrapedb):
    ## First pass: collect the links on the start page
    scrapedb.connect()
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    for link in soup.findAll('a', href=True):
        href = link.get('href')
        if re.search(needToContain, href) == None:
            print href + " does not contain " + needToContain
        else:
            try:
                ## The unique index on fullURL makes this fail for links we already stored
                scrapedb.insertParsed(which, url, "", href)
            except:
                print "Insertion in database failed for " + href
    ## Then follow the links already in the database, up to 5 passes
    for i in range(0, 5):
        alreadythere = scrapedb.getallurls()
        for urls in alreadythere:
            url = urls[0]
            errorMsg = "Loop" + str(i) + ":::" + url + ":::"
            try:
                page = urllib2.urlopen(url)
                try:
                    soup = BeautifulSoup(page)
                    for link in soup.findAll('a', href=True):
                        href = link.get('href')
                        if re.search(needToContain, href) == None:
                            print href + " does not contain " + needToContain
                        else:
                            try:
                                scrapedb.insertParsed(which, url, "", href)
                            except:
                                errorMsg = errorMsg + " insertion in database failed for " + href
                except:
                    errorMsg = errorMsg + ", cannot parse document " + url
            except:
                errorMsg = errorMsg + ", cannot download document " + url
            print errorMsg
    ## Close the database
    scrapedb.close()
############################################################################
## Get all links
############################################################################
which = "De Contrabas"
url = "http://www.decontrabas.com"
needToContain = "contrabas"
scrapedb = ScrapeDatabase("/home/jan/HVG/scraping/decontrabas", "Scrapes_DeContrabas.sqlite3db",
"links")
#fetchSearchResult(which, url, needToContain, scrapedb)
############################################################################
## Fetch all links from the database
############################################################################
scrapedb.connect()
alreadythere = scrapedb.getQuery("select * from links;")
for record in alreadythere:
    which = record[0]
    mainsite = record[1]
    searchTime = record[2]
    fullURL = record[3]
    downloadSucceeded = 0
    parsingSucceeded = 0
    content = None
    print 'Start Parsing ' + fullURL
    try:
        page = urllib2.urlopen(fullURL)
        downloadSucceeded = 1
        try:
            soup = BeautifulSoup(page)
            parsingSucceeded = 1
            content = soup.prettify()  ## Puts the site content in UTF-8
        except:
            print 'failed parsing ' + fullURL
    except:
        print 'failed download ' + fullURL
    scrapedb.insertBeautifulSouped("decontrabas", which, mainsite, searchTime, fullURL,
                                   downloadSucceeded, parsingSucceeded, content)
scrapedb.close()
############################################################################
## Parse the text from the sites
############################################################################
class Cluster:
    text = ""
    ratio = 0.0
    bloc = None

    def clusterPrint(self):
        print str(self.text) + "\nRatio" + str(self.ratio)
def extractUsefulText(soup):
    usefultext = ""
    ## Remove the content of the following html tags
    tagsToRemove = ["form", "style", "script"]
    tagObjects = soup.findAll(tagsToRemove)
    [tagObject.extract() for tagObject in tagObjects]
    ## Remove comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    ## Extract only the useful content of html tags
    ## Only get the div (section), td (table cell), p (paragraph) and img with class "alt"
    ## (text describing an image) from the body, so no header tags h1, ..., h6
    usefultextWithHTML = soup.body.findAll("div") + soup.body.findAll("td") + \
                         soup.body.findAll("p") + soup.body.findAll('img', attrs={'class': 'alt'})
    ## This still contains headers, footers and sidebars, which are of no use to us
    ## Interesting machine learning article at
    ## http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/
    ## For now, we use svn.mathieuleplatre.info/repositories/entry/uhm/uhm/download.py
    ## TODO: look at how we can use R to learn the classification task
    clusters = []
    for tag in usefultextWithHTML:
        text = tag.findAll(text=True)
        text = "".join(text)
        text = text.replace("\n", "")
        x = len(text)
        if x > 0:
            ## We have at least some text in the html tag
            y = len(tag.prettify())
            if y == 0:
                ratio = x
            else:
                ratio = float(x) / y  ## length of the text w.r.t. the length including all html markup in the tag
            c = Cluster()
            c.text = text.lstrip().rstrip()
            c.ratio = ratio
            c.bloc = tag
            clusters.append(c)
    ## Sort by ratio (ascending) and walk through the clusters from the highest ratio down
    clusters.sort(cmp=lambda x, y: cmp(x.ratio, y.ratio))
    max = clusters[-1:][0]
    for cluster in clusters[::-1]:
        ## Minimum ratio to be able to speak of useful text
        if cluster.ratio < 0.4:
            break
        if len(cluster.text) > len(max.text):
            max = cluster
    usefultext = max.bloc.findAll(text=True)
    usefultext = "".join(usefultext)
    usefultext = usefultext.replace("\n", "")
    return usefultext
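## A small illustration of extractUsefulText on a made-up page (not part of the scrape itself):
## example = "<html><body><div id='menu'><a href='#'>home</a></div><p>" + "Plenty of useful text about poetry. " * 10 + "</p></body></html>"
## print extractUsefulText(BeautifulSoup(example))
## ## Prints the paragraph text: it has the highest text-to-markup ratio and the longest text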
## Get the scrapes
scrapedb.connect()
scrapes = scrapedb.getQuery("select * from decontrabas;")
scrapedb.close()
## Get the content of the scrapes and store them
scrapedb.connect()
for scrape in scrapes:
    try:
        realtext = extractUsefulText(BeautifulSoup(scrape[6]))
        scrapedb.insertBeautifulSouped("decontrabas_cleaned", scrape[0], scrape[1], scrape[2],
                                       scrape[3], scrape[4], scrape[5], realtext)
    except:
        print 'failed extracting text for ' + scrape[3]
scrapedb.close()
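## A quick check of the cleaned scrapes (a sketch; the query is only an example of what to inspect):
## scrapedb.connect()
## for row in scrapedb.getQuery("select fullURL, length(content) from decontrabas_cleaned limit 5;"):
##     print row
## scrapedb.close()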