You are on page 1 of 6

import com.sun.xml.internal.fastinfoset.util.StringArray; import java.util.logging.Level; import java.util.logging.Logger; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.

HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.net.*; import java.io.*; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.lang.model.element.Element; import javax.swing.text.Document; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.NodeList; import org.xml.sax.SAXException;

/** *

* @author Sharan */ //@WebServlet(name="SingleScrapper", urlPatterns={"/SingleScrapper"}) public class scrabServlet extends HttpServlet {

@Override public void service(HttpServletRequest request, HttpServletResponse response) throws ServletException,IOException { response.setContentType("text/html;charset=UTF-8"); PrintWriter out = response.getWriter(); String sourceLine=""; String scrapedHtml=""; try {

//String url1=request.getParameter("blogUrl"); String url1="http://in.m.yahoo.com/news"; //String url1="http://news.google.com/news?ned=us&topic=h&output=rss"; //String url1="http://edition.cnn.com/WORLD/"; System.out.println("The url is :"+url1); // The URL address of the page to open. // URL address = new URL("http://en.wikipedia.org/wiki/Cross-platform"); URL address = new URL(url1); // removing the repeated url string , so it displays only once in the output

// // //

Pattern style1 = Pattern.compile(url1); Matcher mstyle1 = style1.matcher(url1); while (mstyle1.find()) scrapedHtml = mstyle1.replaceAll("");

// Open the address and create a BufferedReader with the source code. InputStreamReader pageInput = new InputStreamReader(address.openStream()); BufferedReader source = new BufferedReader(pageInput); // System.out.println(source); // Append each new HTML line into one string. Add a tab character. while ((sourceLine = source.readLine()) != null) scrapedHtml += sourceLine + "\t"; //System.out.println("the scraped data is:"+scrapedHtml);

// Remove style tags & inclusive scrapedHtml Pattern style = Pattern.compile("<style.*?>.*?</style>"); Matcher mstyle = style.matcher(scrapedHtml); while (mstyle.find()) scrapedHtml = mstyle.replaceAll("");

// Remove script tags & inclusive scrapedHtml Pattern script = Pattern.compile("<script.*?>.*?</script>"); Matcher mscript = script.matcher(scrapedHtml); while (mscript.find()) scrapedHtml = mscript.replaceAll("");

// //// //// //// // //

//remove the white spaces in the scrapedHtml Pattern white =Pattern.compile("\\s"); Matcher mwhite=white.matcher(scrapedHtml); while(mwhite.find()) scrapedHtml=mwhite.replaceAll("");

// Remove primary HTML tags Pattern tag = Pattern.compile("<.*?>"); Matcher mtag = tag.matcher(scrapedHtml); while (mtag.find()) scrapedHtml = mtag.replaceAll("");

// Remove comment tags & inclusive scrapedHtml Pattern comment = Pattern.compile("<!--.*?-->"); Matcher mcomment = comment.matcher(scrapedHtml); while (mcomment.find()) scrapedHtml = mcomment.replaceAll("");

// Remove special characters, such as &nbsp; Pattern sChar = Pattern.compile("&.*?;"); Matcher msChar = sChar.matcher(scrapedHtml); while (msChar.find()) scrapedHtml = msChar.replaceAll("");

// Remove the tab characters. Replace with new line characters. Pattern nLineChar = Pattern.compile("\t+");

Matcher mnLine = nLineChar.matcher(scrapedHtml); while (mnLine.find()) scrapedHtml = mnLine.replaceAll("\n");

// scrape first line Pattern nLineChar1 = Pattern.compile("Yahoo! News:News"); Matcher mnLine1 = nLineChar1.matcher(scrapedHtml); while (mnLine1.find()) scrapedHtml = mnLine1.replaceAll("\n");

// out.println(scrapedHtml); Pattern nLineChar6 = Pattern.compile("More SectionsNationalWorldBusinessSportsAutosScience/TechEntertainmentLifestyleArchivePopularSet My LocationSign in "); Matcher mnLine6 = nLineChar6.matcher(scrapedHtml); while (mnLine6.find()) scrapedHtml = mnLine6.replaceAll(""); System.out.println(""+scrapedHtml); Pattern sChar1 = Pattern.compile("ANI"); Matcher msChar1 = sChar1.matcher(scrapedHtml); while (msChar1.find()) scrapedHtml = msChar1.replaceAll("#\\\n");

Pattern sChar2 = Pattern.compile("IANS"); Matcher msChar2 = sChar2.matcher(scrapedHtml); while (msChar2.find()) scrapedHtml = msChar2.replaceAll("#\\\n");

Pattern sChar3 = Pattern.compile("PTI"); Matcher msChar3 = sChar3.matcher(scrapedHtml);

while (msChar3.find()) scrapedHtml = msChar3.replaceAll("#\\\n");

/* Pattern sChar4 = Pattern.compile("Tue"); Matcher msChar4 = sChar4.matcher(scrapedHtml); while (msChar4.find()) scrapedHtml = msChar4.replaceAll(""); */

// // //

Pattern style1 = Pattern.compile("IST"); Matcher mstyle1 = style1.matcher(scrapedHtml); while (mstyle1.find()) scrapedHtml = mstyle1.replaceAll("#\\\n"); // System.out.println("the scraped data is:"+scrapedHtml); System.out.println(""+scrapedHtml); out.println(scrapedHtml); }

finally { out.close(); } }

You might also like