1. Installing and referencing Jsoup
1) Maven: pom.xml
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
2) Downloading the Jsoup jar and referencing it manually
Download: https://jsoup.org/download
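Either way, a one-line parse makes a quick smoke test for the setup. A minimal sketch (the HTML literal here is just an arbitrary example):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupSmokeTest {
    public static void main(String[] args) {
        // Parse a literal HTML string; no network access required.
        Document doc = Jsoup.parse("<html><head><title>hello</title></head><body><p>jsoup works</p></body></html>");
        System.out.println(doc.title());             // hello
        System.out.println(doc.select("p").text());  // jsoup works
    }
}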
2. Parsing Baidu search results with Jsoup
1) The model holding the information to extract
public class Webpage {
    // title
    private String title;
    // link
    private String url;
    // summary
    private String summary;
    // body text
    private String content;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getSummary() {
        return summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
2) Extracting page content
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TextExtract {

    private static final Logger LOG = LoggerFactory.getLogger(TextExtract.class);

    private static List<String> lines;
    private final static int blocksWidth;
    private static int threshold;
    private static String html;
    private static boolean flag;
    private static int start;
    private static int end;
    private static StringBuilder text;
    private static ArrayList<Integer> indexDistribution;

    static {
        lines = new ArrayList<>();
        indexDistribution = new ArrayList<>();
        text = new StringBuilder();
        blocksWidth = 3;
        flag = false;
        /* If blocks of news headlines are not being stripped from the extracted
           body text, simply increase this threshold. */
        /* A larger threshold raises precision but lowers recall; a smaller one
           lets in more noise but can still capture a body text of only one sentence. */
        threshold = 86;
    }

    public static void setthreshold(int value) {
        threshold = value;
    }

    /**
     * Extracts the body text of a page without checking whether the page is a
     * directory/listing page, i.e. the caller guarantees the page is a content page.
     *
     * @param _html the page's HTML string
     * @return the extracted body text
     */
    public static String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * If _flag is true, first checks whether the page is a content page, extracting
     * the body text only in that case; otherwise outputs <b>"unknown"</b>.
     *
     * @param _html the page's HTML string
     * @param _flag true to perform the content-page check; defaults to false
     * @return the extracted body text
     */
    public static String parse(String _html, boolean _flag) {
        flag = _flag;
        html = _html;
        preProcess();
        LOG.debug(html);
        return getText();
    }

    private static void preProcess() {
        html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
        html = html.replaceAll("(?is)<!--.*?-->", "");                // remove HTML comments
        html = html.replaceAll("(?is)<script.*?>.*?</script>", "");   // remove JavaScript
        html = html.replaceAll("(?is)<style.*?>.*?</style>", "");     // remove CSS
        html = html.replaceAll("&.{2,5};|&#.{2,5};", " ");            // remove HTML entities
        html = html.replaceAll("(?is)<.*?>", "");                     // remove remaining tags
    }

    private static String getText() {
        lines = Arrays.asList(html.split("\n"));
        indexDistribution.clear();
        // Compute the text density of each sliding window of blocksWidth lines.
        for (int i = 0; i < lines.size() - blocksWidth; i++) {
            int wordsNum = 0;
            for (int j = i; j < i + blocksWidth; j++) {
                lines.set(j, lines.get(j).replaceAll("\\s+", ""));
                wordsNum += lines.get(j).length();
            }
            indexDistribution.add(wordsNum);
            LOG.debug(wordsNum + "");
        }
        start = -1;
        end = -1;
        boolean boolstart = false, boolend = false;
        text.setLength(0);
        // The -3 bound guards the i + 3 look-ahead below against an
        // IndexOutOfBoundsException near the end of the distribution.
        for (int i = 0; i < indexDistribution.size() - 3; i++) {
            if (indexDistribution.get(i) > threshold && !boolstart) {
                if (indexDistribution.get(i + 1).intValue() != 0
                        || indexDistribution.get(i + 2).intValue() != 0
                        || indexDistribution.get(i + 3).intValue() != 0) {
                    boolstart = true;
                    start = i;
                    continue;
                }
            }
            if (boolstart) {
                if (indexDistribution.get(i).intValue() == 0
                        || indexDistribution.get(i + 1).intValue() == 0) {
                    end = i;
                    boolend = true;
                }
            }
            if (boolend) {
                LOG.debug((start + 1) + "\t\t" + (end + 1));
                StringBuilder tmp = new StringBuilder();
                for (int ii = start; ii <= end; ii++) {
                    if (lines.get(ii).length() < 5) {
                        continue;
                    }
                    tmp.append(lines.get(ii)).append("\n");
                }
                String str = tmp.toString();
                LOG.debug(str);
                // Skip copyright/footer blocks ("版权所有" means "all rights reserved").
                if (str.contains("Copyright") || str.contains("版权所有")) {
                    continue;
                }
                text.append(str);
                boolstart = boolend = false;
            }
        }
        return text.toString();
    }
}
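The extractor works on raw HTML, so it pairs naturally with a Jsoup fetch. A minimal usage sketch, assuming a content page at a placeholder URL:

import org.jsoup.Jsoup;

public class TextExtractDemo {
    public static void main(String[] args) throws Exception {
        // Fetch the raw HTML of an article page (placeholder URL).
        String html = Jsoup.connect("http://example.com/article.html").get().html();
        // Raise the threshold if blocks of headlines leak into the output.
        TextExtract.setthreshold(100);
        System.out.println(TextExtract.parse(html));
    }
}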
3) Fetching the search results
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class JSoupBaiduSearcher extends AbstractBaiduSearcher {

    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    @Override
    public List<Webpage> search(String keyword) {
        return search(keyword, 1);
    }

    @Override
    public List<Webpage> search(String keyword, int page) {
        int pageSize = 10;
        // Baidu returns 10 results per page. The pn parameter is not the page number
        // but the offset of the first result: pn=0 for page 1, pn=10 for page 2,
        // pn=20 for page 3, and so on, i.e. pn = (page - 1) * pageSize.
        String url = "http://www.baidu.com/s?pn=" + (page - 1) * pageSize + "&wd=" + keyword;
        List<Webpage> webpages = new ArrayList<>();
        try {
            Document document = Jsoup.connect(url).get();
            // total number of search results
            int total = getBaiduSearchResultCount(document);
            int len = 10;
            if (total < 1) {
                return null;
            }
            // fewer results than one full page
            if (total < 10) {
                len = total;
            }
            for (int i = 0; i < len; i++) {
                String titleCssQuery = "html body div div div div#content_left div#"
                        + (i + 1 + (page - 1) * pageSize) + ".result.c-container h3.t a";
                String summaryCssQuery = "html body div div div div#content_left div#"
                        + (i + 1 + (page - 1) * pageSize) + ".result.c-container div.c-abstract";
                LOG.debug("titleCssQuery:" + titleCssQuery);
                LOG.debug("summaryCssQuery:" + summaryCssQuery);
                Element titleElement = document.select(titleCssQuery).first();
                String href = "";
                String titleText = "";
                if (titleElement != null) {
                    titleText = titleElement.text();
                    href = titleElement.attr("href");
                } else {
                    // handle Baidu Baike result items
                    titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
                    summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
                    LOG.debug("Baidu Baike titleCssQuery:" + titleCssQuery);
                    LOG.debug("Baidu Baike summaryCssQuery:" + summaryCssQuery);
                    titleElement = document.select(titleCssQuery).first();
                    if (titleElement != null) {
                        titleText = titleElement.text();
                        href = titleElement.attr("href");
                    }
                }
                LOG.debug(titleText);
                Element summaryElement = document.select(summaryCssQuery).first();
                // handle Baidu Zhidao result items
                if (summaryElement == null) {
                    summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font");
                    LOG.debug("Baidu Zhidao summaryCssQuery:" + summaryCssQuery);
                    summaryElement = document.select(summaryCssQuery).first();
                }
                String summaryText = "";
                if (summaryElement != null) {
                    summaryText = summaryElement.text();
                }
                LOG.debug(summaryText);
                if (titleText != null && !"".equals(titleText.trim())
                        && summaryText != null && !"".equals(summaryText.trim())) {
                    Webpage webpage = new Webpage();
                    webpage.setTitle(titleText);
                    webpage.setUrl(href);
                    webpage.setSummary(summaryText);
                    webpages.add(webpage);
                } else {
                    LOG.error("Failed to extract a result item: " + titleText + " - " + summaryText);
                }
            }
        } catch (IOException ex) {
            LOG.error("Search failed", ex);
        }
        return webpages;
    }

    /**
     * Gets the number of Baidu search results by stripping the digits out of text such as:
     * 百度为您找到相关结果约13,200个 ("Baidu found about 13,200 related results for you")
     *
     * @param document the result page
     * @return the result count
     */
    private int getBaiduSearchResultCount(Document document) {
        String cssQuery = "html body div div div div.nums";
        LOG.debug("total cssQuery: " + cssQuery);
        Element totalElement = document.select(cssQuery).first();
        String totalText = totalElement.text();
        LOG.info("Result count text: " + totalText);
        String regEx = "[^0-9]";
        Pattern pattern = Pattern.compile(regEx);
        Matcher matcher = pattern.matcher(totalText);
        totalText = matcher.replaceAll("");
        int total = Integer.parseInt(totalText);
        LOG.info("Result count: " + total);
        return total;
    }
    public static void main(String[] args) {
        Searcher searcher = new JSoupBaiduSearcher();
        int page = 2;
        List<Webpage> webpages = searcher.search("六扇门", page);
        if (webpages != null) {
            LOG.info("Search results: page " + page + ", " + webpages.size() + " items on this page");
            int i = 1;
            for (Webpage webpage : webpages) {
                LOG.info("Result " + (i++) + ":");
                LOG.info("Title: " + webpage.getTitle());
                LOG.info("URL: " + webpage.getUrl());
                LOG.info("Summary: " + webpage.getSummary());
                LOG.info("Content: " + webpage.getContent());
                LOG.info("");
            }
        } else {
            LOG.error("No results found");
        }
    }
}
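Note that the searcher concatenates the raw keyword into the URL, which only works by accident for ASCII keywords; a query such as 六扇门 should be percent-encoded. A small sketch of building the same pn/wd URL with proper encoding (the class and method names here are ours, not part of the project):

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

public class BaiduUrlBuilder {

    private static final int PAGE_SIZE = 10;

    // pn is the offset of the first result: (page - 1) * PAGE_SIZE.
    public static String buildSearchUrl(String keyword, int page) throws UnsupportedEncodingException {
        String wd = URLEncoder.encode(keyword, "UTF-8");
        return "http://www.baidu.com/s?pn=" + (page - 1) * PAGE_SIZE + "&wd=" + wd;
    }

    public static void main(String[] args) throws UnsupportedEncodingException {
        // Prints http://www.baidu.com/s?pn=10&wd=%E5%85%AD%E6%89%87%E9%97%A8
        System.out.println(buildSearchUrl("六扇门", 2));
    }
}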
Project source code: https://github.com/ysc/search
3. Parsing Google search results with Jsoup
package crawler;

/**
 * Created by Last on 10/16/2016.
 */

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Random;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import fileHandling.SaveInFile;
import POJO.ScrapedResult;

public class Crawler {

    public static ArrayList<String> dates;

    public static String[] USER_AGENT_MAC = {
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
        "Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)",
        "Mozilla/5.0 (compatible; ABrowse 0.4; Syllable)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 1.1.4322)",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.7 (KHTML, like Gecko) Comodo_Dragon/16.1.1.0 Chrome/16.0.912.63 Safari/535.7",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.30 (KHTML, like Gecko) Comodo_Dragon/12.1.0.0 Chrome/12.0.742.91 Safari/534.30",
        "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Comodo_Dragon/4.1.1.11 Chrome/4.1.249.1042 Safari/532.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.5) Gecko/20031026 Firebird/0.7",
        "Mozilla/5.0 (Windows; U; Win98; de-DE; rv:1.5a) Gecko/20030728 Mozilla Firebird/0.6.1",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4a) Gecko/20030425 Mozilla Firebird/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:2.0) Treco/20110515 Fireweb Navigator/2.4",
        "Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20121202 Firefox/17.0 Iceweasel/17.0.1",
        "Mozilla/5.0 (X11; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1 Iceweasel/15.0.1",
        "Mozilla/5.0 (X11; debian; Linux x86_64; rv:15.0) Gecko/20100101 Iceweasel/15.0",
        "Mozilla/5.0 (X11; Linux i686; rv:14.0) Gecko/20100101 Firefox/14.0.1 Iceweasel/14.0.1",
        "Mozilla/5.0 (X11; Linux i686; rv:14.0) Gecko/20100101 Firefox/14.0 Iceweasel/14.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0.1 Iceweasel/13.0.1",
        "Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0 Iceweasel/13.0"};

    public static ArrayList<ScrapedResult> input(String query, ScrapedResult scrapedResult, String date) {
        System.out.println("I'm here and the date is " + date);
        ArrayList<ScrapedResult> resultList = new ArrayList<ScrapedResult>();
        try {
            // Restrict the Google News search to a single day via
            // tbs=cdr:1,cd_min:<date>,cd_max:<date> (%2C = ',' and %2F = '/').
            String[] dateParts = date.split("/");
            String url = "https://www.google.co.in/search?q=" + query
                    + "&hl=en&gl=in&as_drrb=b&authuser=0&source=lnt"
                    + "&tbs=cdr%3A1%2Ccd_min%3A" + dateParts[0] + "%2F" + dateParts[1] + "%2F" + dateParts[2]
                    + "%2Ccd_max%3A" + dateParts[0] + "%2F" + dateParts[1] + "%2F" + dateParts[2]
                    + "&tbm=nws";
            resultList = processPage(scrapedResult, date, url);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return resultList;
    }

    public ArrayList<String> pickDates() {
        dates = new ArrayList<String>();
        System.out.println("Dates");
        String dt = "18/03/2009"; // start date
        SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
        Calendar c = Calendar.getInstance();
        try {
            c.setTime(sdf.parse(dt));
        } catch (ParseException e) {
            e.printStackTrace();
        }
        while (!dt.equals("31/10/2016")) { // end date
            c.add(Calendar.DATE, 1); // advance one day
            dt = sdf.format(c.getTime());
            dates.add(dt);
            System.out.println(dt);
        }
        return dates;
    }

    public static void main(String[] args) throws IOException {
        ArrayList<ScrapedResult> resultList = new ArrayList<ScrapedResult>();
        Crawler crawler = new Crawler();
        dates = crawler.pickDates();
        System.out.println("Dates size : " + dates.size());
        ScrapedResult pojo = null;
        SaveInFile saveInFile = new SaveInFile();
        Scanner sc = new Scanner(System.in);
        System.out.println("Query");
        String query = sc.nextLine();
        // Strings are immutable, so the replaceAll result must be reassigned.
        query = query.replaceAll(" ", "+");
        sc.close();
        for (String date : dates) {
            resultList = input(query, pojo, date);
            if (resultList == null) {
                continue; // the fetch for this date failed
            }
            saveInFile.writeFile(resultList);
            for (ScrapedResult scr : resultList) {
                System.out.println(scr.getDate());
                System.out.println(scr.getText());
            }
        }
    }

    public static ArrayList<ScrapedResult> processPage(ScrapedResult scrapedResult, String wanted, String url)
            throws IOException {
        ArrayList<ScrapedResult> resultList = new ArrayList<ScrapedResult>();
        System.out.println("------ :\t\t" + url + "\t\t: ------");
        Document doc = null;
        try {
            doc = Jsoup.connect(url)
                    .userAgent(USER_AGENT_MAC[new Random().nextInt(USER_AGENT_MAC.length)])
                    .get();
            if (doc != null) {
                // result links on the Google News results page
                Elements tags = doc.select("a.l._HId");
                ArrayList<String> links = new ArrayList<String>();
                for (Element aTag : tags) {
                    links.add(aTag.attr("href"));
                }
                for (String link : links) {
                    if (link != null) {
                        System.out.println("**********************************[" + link + "]**********************************");
                        resultList.add(navigatePage(scrapedResult, wanted, link));
                    }
                }
            }
        } catch (Exception e1) {
            e1.printStackTrace();
            return null;
        }
        try {
            // random delay of up to two minutes between queries
            Thread.sleep(new Random().nextInt(120) * 1000L);
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
        }
        System.out.println("HA " + resultList.size());
        return resultList;
    }

    private static ScrapedResult navigatePage(ScrapedResult scrapedResult, String wanted, String link) {
        scrapedResult = new ScrapedResult();
        scrapedResult.setDate(wanted);
        Document doc = null;
        try {
            doc = Jsoup.connect(link)
                    .userAgent(USER_AGENT_MAC[new Random().nextInt(USER_AGENT_MAC.length)])
                    .get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        String resultText = null;
        if (doc != null) {
            resultText = extractText(doc);
        }
        if (resultText != null) {
            scrapedResult.setText(resultText);
        } else {
            scrapedResult.setText("NULL");
        }
        System.out.println("Final ______" + scrapedResult.getText());
        return scrapedResult;
    }

    private static String extractText(Document doc) {
        // Collect the text of the elements that usually carry article content.
        StringBuilder result = new StringBuilder();
        result.append(doc.select("p").text()).append(' ');
        result.append(doc.select("h1").text()).append(' ');
        result.append(doc.select("h2").text()).append(' ');
        result.append(doc.select("strong").text()).append(' ');
        result.append(doc.select("b").text()).append(' ');
        result.append(doc.select("summary").text());
        return result.toString();
    }
}
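The crawler repeats the same connect/user-agent/sleep pattern in two places; pulling it into a helper also makes the politeness policy explicit. A minimal sketch with an explicit timeout and a bounded random delay (the class name and the delay bounds are our choices, not from the project):

import java.io.IOException;
import java.util.Random;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetcher {

    private static final Random RANDOM = new Random();

    public static Document fetch(String url, String[] userAgents) throws IOException {
        Document doc = Jsoup.connect(url)
                .userAgent(userAgents[RANDOM.nextInt(userAgents.length)]) // rotate over the whole array
                .timeout(10_000)  // fail after 10 seconds instead of hanging
                .get();
        try {
            // sleep 1-10 seconds so consecutive requests do not hammer the server
            Thread.sleep((1 + RANDOM.nextInt(10)) * 1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
        return doc;
    }
}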
Project source code: https://github.com/adishjain/GoogleResultScrapper
Related articles:
Java Jsoup: requesting a URL and handling the JSON response (sample code)
Java Jsoup: submitting data via POST and retrieving the cookies (sample code)