
cd /home/jan/HVG/scraping/decontrabas

sqlite3 Scrapes_DeContrabas.sqlite3db

drop table if exists links;
create table links(
which varchar(100),
mainsite varchar(100),
searchTime DATE,
fullURL varchar(500)
);
create unique index if not exists idx_fullURL on links(fullURL);

drop table if exists decontrabas;
create table decontrabas(
which varchar(100),
mainsite varchar(100),
searchTime DATE,
fullURL varchar(500),
downloadSucceeded numeric,
parsingSucceeded numeric,
content BLOB
);
create unique index if not exists idx_decontrabas_fullURL on decontrabas(fullURL);

drop table if exists decontrabas_cleaned;
create table decontrabas_cleaned(
which varchar(100),
mainsite varchar(100),
searchTime DATE,
fullURL varchar(500),
downloadSucceeded numeric,
parsingSucceeded numeric,
content BLOB
);
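
-- Optional sanity check, not part of the original setup: still inside the same sqlite3 session,
-- list the tables and indexes that the statements above should have created.
select name, type from sqlite_master where type in ('table', 'index') order by type, name;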

import urllib2
import re
import os
from time import gmtime, strftime, localtime
from pysqlite2 import dbapi2 as sqlite
from BeautifulSoup import BeautifulSoup, Comment

############################################################################
## Database class containing the scrape results
############################################################################
class ScrapeDatabase():
    def __init__(self, path, dbname, dbtable):
        self._dbpath = os.path.join(path, dbname)
        self._dbtable = dbtable
    def connect(self):
        self._connection = sqlite.connect(self._dbpath)
        cur = self._connection.cursor()
        cur.execute("select max(rowid) from " + self._dbtable)
        self._maxrowid = cur.fetchall()[0][0]
        self._connection.commit()
        cur.close()
    def close(self):
        self._connection.close()
    def pr(self):
        print self._maxrowid
    def insertParsed(self, which, mainsite, searchTime, fullURL):
        cur = self._connection.cursor()
        searchTime = str(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
        cur.execute("INSERT INTO " + self._dbtable + " VALUES(?,?,?,?)", (which, mainsite, searchTime, fullURL))
        self._connection.commit()
        cur.close()
    def insertBeautifulSouped(self, table, which, mainsite, searchTime, fullURL, downloadSucceeded, parsingSucceeded, content):
        cur = self._connection.cursor()
        cur.execute("INSERT INTO " + table + " VALUES(?,?,?,?,?,?,?)", (which, mainsite, searchTime, fullURL, downloadSucceeded, parsingSucceeded, content))
        self._connection.commit()
        cur.close()       
    def getallurls(self):
        cur = self._connection.cursor()
        cur.execute("select distinct fullURL from " + self._dbtable)
        urls = cur.fetchall()
        self._connection.commit()
        cur.close()
        return urls
    def getQuery(self, query):
        cur = self._connection.cursor()
        cur.execute(query)
        urls = cur.fetchall()
        self._connection.commit()
        cur.close()
        return urls
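
## An optional read-only check of the class above, using a throwaway checkdb handle. It assumes the
## links table created at the top already exists; it reuses the same path and database as the rest of
## this script and inserts nothing.
checkdb = ScrapeDatabase("/home/jan/HVG/scraping/decontrabas", "Scrapes_DeContrabas.sqlite3db", "links")
checkdb.connect()
checkdb.pr()   ## highest rowid stored so far (prints None on a fresh database)
print str(len(checkdb.getallurls())) + " distinct URLs currently in the links table"
checkdb.close()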

############################################################################
## Method to get all links
############################################################################
def fetchSearchResult(which, url, needToContain, scrapedb):
    ## First pass: collect links from the start page
    scrapedb.connect()
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    for link in soup.findAll('a', href=True):
        href = link.get('href')
        if re.search(needToContain, href) == None:
            print href + " does not contain " + needToContain
        else:
            try:
                scrapedb.insertParsed(which, url, "", href)
            except:
                print "Insertion in database failed for " + href
    ## Crawl the stored links again, up to 5 passes, to pick up links that only appear on subpages
    for i in range(0,5):
        alreadythere = scrapedb.getallurls()
        for urls in alreadythere:
            url = urls[0]
            errorMsg = "Loop" + str(i) + ":::" + url + ":::"
            try:
                page = urllib2.urlopen(url)
                try:
                    soup = BeautifulSoup(page)
                    for link in soup.findAll('a', href=True):
                        href = link.get('href')
                        if re.search(needToContain, href) == None:
                            print href + " does not contain " + needToContain 
                        else:
                            try:             
                                scrapedb.insertParsed(which, url, "", href)
                            except:
                                errorMsg = errorMsg + " insertion in database failed for " + href
                except:
                    errorMsg = errorMsg + ", cannot parse document " + url
            except:
                errorMsg = errorMsg + ", cannot download document " + url
            print errorMsg
    ## Close the database
    scrapedb.close()

############################################################################
## Get all links
############################################################################
which = "De Contrabas"
url = "http://www.decontrabas.com"
needToContain = "contrabas"
scrapedb = ScrapeDatabase("/home/jan/HVG/scraping/decontrabas", "Scrapes_DeContrabas.sqlite3db", "links")
#fetchSearchResult(which, url, needToContain, scrapedb)

############################################################################
## Fetch all links from the database
############################################################################
scrapedb.connect()
alreadythere = scrapedb.getQuery("select * from links;")
for record in alreadythere:
    which = record[0]
    mainsite = record[1]
    searchTime = record[2]
    fullURL = record[3]
    downloadSucceeded = 0
    parsingSucceeded = 0
    content = None
    print 'Start Parsing ' + fullURL
    try:
        page = urllib2.urlopen(fullURL)
        downloadSucceeded = 1
        try:
            soup = BeautifulSoup(page)
            parsingSucceeded = 1
            content = soup.prettify() ## Puts the site content in UTF-8
        except:
            print 'failed parsing ' + fullURL
    except:
        print 'failed download ' + fullURL
    scrapedb.insertBeautifulSouped("decontrabas", which, mainsite, searchTime, fullURL, 
downloadSucceeded, parsingSucceeded, content)

scrapedb.close()
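
## Optional summary after the download loop: how many pages were downloaded and parsed successfully.
## It only reads the decontrabas table that was just filled, using the getQuery method defined above.
scrapedb.connect()
for row in scrapedb.getQuery("select downloadSucceeded, parsingSucceeded, count(*) from decontrabas group by downloadSucceeded, parsingSucceeded;"):
    print "downloaded=" + str(row[0]) + " parsed=" + str(row[1]) + " count=" + str(row[2])
scrapedb.close()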

############################################################################
## Parse the text from the sites
############################################################################
class Cluster:
    text = ""
    ratio = 0.0
    bloc = None
    def clusterPrint(self):
        print str(self.text) + "\nRatio" + str(self.ratio)

def extractUsefulText(soup):
    usefultext = ""
    ## Remove the content of the following html tags
    tagsToRemove = ["form", "style", "script"]
    tagObjects = soup.findAll(tagsToRemove)
    [tagObject.extract() for tagObject in tagObjects]
    ## Remove comments
    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
    [comment.extract() for comment in comments]
    ## Extract only useful content of html tags
    ## Only get the div (section), td (table cell), p (paragraph) and img tags with class 'alt' (text describing an image) from the body
    ## So no header tags h1, ..., h6
    usefultextWithHTML = soup.body.findAll("div") + soup.body.findAll("td") + soup.body.findAll("p") + soup.body.findAll('img', attrs={'class': 'alt'})
    ## Still contains headers and footers and sidebars which are of no use to us
    ## interesting machine learning article at
    ## (http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/)
    ## for now, we use svn.mathieu-leplatre.info/repositories/entry/uhm/uhm/download.py
    ## TODO: look how we can use R to learn the classification task
    clusters = []
    for tag in usefultextWithHTML:
        text = tag.findAll(text=True)
        text = "".join(text)
        text = text.replace("\n", "")
        x = len(text)
        if x > 0:
            ## we have at least some text in the html tag
            y = len(tag.prettify())
            if y == 0:
                ratio = x
            else:
                ratio = float(x)/y ## length of text wrt length including all html information in the tag
            c = Cluster()
            c.text  = text.lstrip().rstrip()
            c.ratio = ratio
            c.bloc  = tag
            clusters.append(c)
    if not clusters:
        ## No tag with visible text was found; return the empty string
        return usefultext
    ## Pick the block with the highest text-to-markup ratio, preferring the longest text among blocks above the threshold
    clusters.sort(cmp=lambda x, y: cmp(x.ratio, y.ratio))
    best = clusters[-1]
    for cluster in clusters[::-1]:
        ## Minimum ratio to be able to speak of useful text
        if cluster.ratio < 0.4:
            break
        if len(cluster.text) > len(best.text):
            best = cluster
    usefultext = best.bloc.findAll(text=True)
    usefultext = "".join(usefultext)
    usefultext = usefultext.replace("\n", "")
    return usefultext
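
## To see what the ratio heuristic above rewards, here is a small illustration on an assumed toy
## fragment (demo, block and blocktext are throwaway names, not taken from the scrape): a navigation
## div full of links gets a low text-to-markup ratio, while a div of plain prose scores much higher.
demo = BeautifulSoup('<div><a href="/">home</a> <a href="/archief">archief</a></div><div>A long paragraph holding the actual text of the post, which is the part we want to keep.</div>')
for block in demo.findAll("div"):
    blocktext = "".join(block.findAll(text=True)).replace("\n", "")
    print round(float(len(blocktext)) / len(block.prettify()), 2), repr(blocktext)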

## Get the scrapes
scrapedb.connect()
scrapes = scrapedb.getQuery("select * from decontrabas;") 
scrapedb.close()  

## Get the content of the scrapes and store them
scrapedb.connect()
for scrape in scrapes:
    try:
        realtext = extractUsefulText(BeautifulSoup(scrape[6]))
        scrapedb.insertBeautifulSouped("decontrabas_cleaned", scrape[0], scrape[1], scrape[2], scrape[3], scrape[4], scrape[5], realtext)
    except:
        print 'failed extracting text for ' + scrape[3]

scrapedb.close()          
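
## One last optional look at the result: a few cleaned records and the length of the extracted text.
scrapedb.connect()
for row in scrapedb.getQuery("select fullURL, length(content) from decontrabas_cleaned limit 5;"):
    print row[0] + " -> " + str(row[1]) + " characters of cleaned text"
scrapedb.close()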
