Professional Documents
Culture Documents
HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.net.*; import java.io.*; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.lang.model.element.Element; import javax.swing.text.Document; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.NodeList; import org.xml.sax.SAXException;
/** *
@Override public void service(HttpServletRequest request, HttpServletResponse response) throws ServletException,IOException { response.setContentType("text/html;charset=UTF-8"); PrintWriter out = response.getWriter(); String sourceLine=""; String scrapedHtml=""; try {
//String url1=request.getParameter("blogUrl"); String url1="http://in.m.yahoo.com/news"; //String url1="http://news.google.com/news?ned=us&topic=h&output=rss"; //String url1="http://edition.cnn.com/WORLD/"; System.out.println("The url is :"+url1); // The URL address of the page to open. // URL address = new URL("http://en.wikipedia.org/wiki/Cross-platform"); URL address = new URL(url1); // removing the repeated url string , so it displays only once in the output
// // //
Pattern style1 = Pattern.compile(url1); Matcher mstyle1 = style1.matcher(url1); while (mstyle1.find()) scrapedHtml = mstyle1.replaceAll("");
// Open the address and create a BufferedReader with the source code. InputStreamReader pageInput = new InputStreamReader(address.openStream()); BufferedReader source = new BufferedReader(pageInput); // System.out.println(source); // Append each new HTML line into one string. Add a tab character. while ((sourceLine = source.readLine()) != null) scrapedHtml += sourceLine + "\t"; //System.out.println("the scraped data is:"+scrapedHtml);
// Remove style tags & inclusive scrapedHtml Pattern style = Pattern.compile("<style.*?>.*?</style>"); Matcher mstyle = style.matcher(scrapedHtml); while (mstyle.find()) scrapedHtml = mstyle.replaceAll("");
// Remove script tags & inclusive scrapedHtml Pattern script = Pattern.compile("<script.*?>.*?</script>"); Matcher mscript = script.matcher(scrapedHtml); while (mscript.find()) scrapedHtml = mscript.replaceAll("");
//remove the white spaces in the scrapedHtml Pattern white =Pattern.compile("\\s"); Matcher mwhite=white.matcher(scrapedHtml); while(mwhite.find()) scrapedHtml=mwhite.replaceAll("");
// Remove primary HTML tags Pattern tag = Pattern.compile("<.*?>"); Matcher mtag = tag.matcher(scrapedHtml); while (mtag.find()) scrapedHtml = mtag.replaceAll("");
// Remove comment tags & inclusive scrapedHtml Pattern comment = Pattern.compile("<!--.*?-->"); Matcher mcomment = comment.matcher(scrapedHtml); while (mcomment.find()) scrapedHtml = mcomment.replaceAll("");
// Remove special characters, such as Pattern sChar = Pattern.compile("&.*?;"); Matcher msChar = sChar.matcher(scrapedHtml); while (msChar.find()) scrapedHtml = msChar.replaceAll("");
// Remove the tab characters. Replace with new line characters. Pattern nLineChar = Pattern.compile("\t+");
// scrape first line Pattern nLineChar1 = Pattern.compile("Yahoo! News:News"); Matcher mnLine1 = nLineChar1.matcher(scrapedHtml); while (mnLine1.find()) scrapedHtml = mnLine1.replaceAll("\n");
// out.println(scrapedHtml); Pattern nLineChar6 = Pattern.compile("More SectionsNationalWorldBusinessSportsAutosScience/TechEntertainmentLifestyleArchivePopularSet My LocationSign in "); Matcher mnLine6 = nLineChar6.matcher(scrapedHtml); while (mnLine6.find()) scrapedHtml = mnLine6.replaceAll(""); System.out.println(""+scrapedHtml); Pattern sChar1 = Pattern.compile("ANI"); Matcher msChar1 = sChar1.matcher(scrapedHtml); while (msChar1.find()) scrapedHtml = msChar1.replaceAll("#\\\n");
Pattern sChar2 = Pattern.compile("IANS"); Matcher msChar2 = sChar2.matcher(scrapedHtml); while (msChar2.find()) scrapedHtml = msChar2.replaceAll("#\\\n");
/* Pattern sChar4 = Pattern.compile("Tue"); Matcher msChar4 = sChar4.matcher(scrapedHtml); while (msChar4.find()) scrapedHtml = msChar4.replaceAll(""); */
// // //
Pattern style1 = Pattern.compile("IST"); Matcher mstyle1 = style1.matcher(scrapedHtml); while (mstyle1.find()) scrapedHtml = mstyle1.replaceAll("#\\\n"); // System.out.println("the scraped data is:"+scrapedHtml); System.out.println(""+scrapedHtml); out.println(scrapedHtml); }
finally { out.close(); } }