1、安装引用Jsoup
1) 在Maven的pom.xml中添加依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2) 下载Jsoup的jar包并手动引用
下载地址:https://jsoup.org/download
2、使用Jsoup解析百度搜索结果
1) 要获得信息的Model
/**
 * Mutable value holder for one search hit: title, link, summary and body content.
 * Every field starts out as {@code null} until its setter is called.
 */
public class Webpage {
    /** Title of the hit. */
    private String title;
    /** Link the hit points to. */
    private String url;
    /** Short summary shown with the hit. */
    private String summary;
    /** Body content of the linked page. */
    private String content;

    // ---- readers ----

    /** @return the title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @return the link, or {@code null} if none has been set */
    public String getUrl() {
        return this.url;
    }

    /** @return the summary, or {@code null} if none has been set */
    public String getSummary() {
        return this.summary;
    }

    /** @return the body content, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    // ---- writers ----

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param url the link to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @param content the body content to store */
    public void setContent(String content) {
        this.content = content;
    }
}
2) 获取资源内容
/**
 * Extracts the main body text of an HTML page with a line-block density heuristic.
 *
 * <p>The markup is stripped, the remaining text is split into lines, and the character count
 * of every window of {@code BLOCKS_WIDTH} consecutive lines is recorded. A text run starts at
 * a window whose count exceeds the threshold and ends at the next empty window.
 *
 * <p>All working state lives in static fields that {@link #parse(String, boolean)} mutates
 * without synchronization, so concurrent calls would interfere with each other.
 */
public class TextExtract {
    private static final Logger LOG = LoggerFactory.getLogger(TextExtract.class);
    /** Number of consecutive lines summed into one window. */
    private static final int BLOCKS_WIDTH = 3;
    private static List<String> lines;
    private static int threshold;
    private static String html;
    private static boolean flag;
    private static int start;
    private static int end;
    private static StringBuilder text;
    private static ArrayList<Integer> indexDistribution;

    static {
        lines = new ArrayList<>();
        indexDistribution = new ArrayList<>();
        text = new StringBuilder();
        flag = false;
        /* If block-like news headlines are not filtered out of the extracted body, raise this threshold. */
        /* A larger value improves precision and lowers recall; a smaller value lets in more noise
           but still captures a body that consists of a single sentence. */
        threshold = 86;
    }

    /**
     * Overrides the window-size threshold used to detect the start of a text run.
     *
     * @param value new threshold, in characters per window
     */
    public static void setthreshold(int value) {
        threshold = value;
    }

    /**
     * Extracts the body text without checking whether the page is a directory-style page;
     * the caller is expected to pass a content page.
     *
     * @param _html HTML source of the page
     *
     * @return the extracted body text
     */
    public static String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * Extracts the body text from the given HTML.
     *
     * @param _html HTML source of the page; {@code null} yields an empty string
     * @param _flag requested content-page check; the value is stored but the extraction code
     *              in this class never consults it
     *
     * @return the extracted body text, possibly empty
     */
    public static String parse(String _html, boolean _flag) {
        if (_html == null) {
            return "";
        }
        flag = _flag;
        html = _html;
        preProcess();
        LOG.debug(html);
        return getText();
    }

    /** Strips doctype, comments, scripts, styles, character entities and all remaining tags. */
    private static void preProcess() {
        html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
        html = html.replaceAll("(?is)<!--.*?-->", ""); // remove html comment
        html = html.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript
        html = html.replaceAll("(?is)<style.*?>.*?</style>", ""); // remove css
        html = html.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char
        html = html.replaceAll("(?is)<.*?>", "");
        //<!--[if !IE]>|xGv00|9900d21eb16fa4350a3001b3974a9415<![endif]-->
    }

    /**
     * Returns the window size at {@code index}, or 0 when the index lies past the last window.
     * Treating missing windows as empty keeps the look-ahead below inside the list bounds.
     */
    private static int blockSize(int index) {
        return index < indexDistribution.size() ? indexDistribution.get(index) : 0;
    }

    /** Runs the window heuristic over the pre-processed text and collects the detected runs. */
    private static String getText() {
        lines = Arrays.asList(html.split("\n"));
        // Strip whitespace once per line instead of once per window that covers the line.
        for (int i = 0; i < lines.size(); i++) {
            lines.set(i, lines.get(i).replaceAll("\\s+", ""));
        }

        indexDistribution.clear();
        for (int i = 0; i < lines.size() - BLOCKS_WIDTH; i++) {
            int wordsNum = 0;
            for (int j = i; j < i + BLOCKS_WIDTH; j++) {
                wordsNum += lines.get(j).length();
            }
            indexDistribution.add(wordsNum);
            LOG.debug("{}", wordsNum);
        }

        start = -1;
        end = -1;
        boolean boolstart = false, boolend = false;
        text.setLength(0);
        for (int i = 0; i < indexDistribution.size() - 1; i++) {
            if (!boolstart && indexDistribution.get(i) > threshold) {
                // Only open a run when at least one of the next three windows has content.
                // blockSize() guards i + 2 and i + 3, which can lie past the end of the list.
                if (blockSize(i + 1) != 0 || blockSize(i + 2) != 0 || blockSize(i + 3) != 0) {
                    boolstart = true;
                    start = i;
                    continue;
                }
            }
            if (boolstart) {
                if (indexDistribution.get(i).intValue() == 0
                        || indexDistribution.get(i + 1).intValue() == 0) {
                    end = i;
                    boolend = true;
                }
            }
            if (boolend) {
                LOG.debug("{}\t\t{}", start + 1, end + 1);
                StringBuilder tmp = new StringBuilder();
                for (int ii = start; ii <= end; ii++) {
                    if (lines.get(ii).length() < 5) {
                        continue;
                    }
                    tmp.append(lines.get(ii)).append("\n");
                }
                String str = tmp.toString();
                LOG.debug(str);
                // Close the run before deciding whether to keep it, so that a skipped
                // copyright block does not stay open and swallow every later run.
                boolstart = boolend = false;
                if (str.contains("Copyright") || str.contains("版权所有")) {
                    continue;
                }
                text.append(str);
            }
        }
        return text.toString();
    }
}
3) 获取搜索结果
public class JSoupBaiduSearcher extends AbstractBaiduSearcher {
private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);
@Override
public List<Webpage> search(String keyword) {
return search(keyword, 1);
}
@Override
public List<Webpage> search(String keyword, int page) {
int pageSize = 10;
//百度搜索结果每页大小为10,pn参数代表的不是页数,而是返回结果的开始数
//如获取第一页则pn=0,第二页则pn=10,第三页则pn=20,以此类推,抽象出模式:(page-1)*pageSize
String url = "http://www.baidu.com/s?pn="+(page-1)*pageSize+"&wd="+keyword;
// SearchResult searchResult = new SearchResult();
// searchResult.setPage(page);
List<Webpage> webpages = new ArrayList<>();
try {
Document document = Jsoup.connect(url).get();
//获取搜索结果数目
int total = getBaiduSearchResultCount(document);
// searchResult.setTotal(total);
int len = 10;
if (total < 1) {
return null;
}
//如果搜索到的结果不足一页
if (total < 10) {
len = total;
}
for (int i = 0; i < len; i++) {
String titleCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container h3.t a";
String summaryCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container div.c-abstract";
LOG.debug("titleCssQuery:" + titleCssQuery);
LOG.debug("summaryCssQuery:" + summaryCssQuery);
Element titleElement = document.select(titleCssQuery).first();
String href = "";
String titleText = "";
if(titleElement != null){
titleText = titleElement.text();
href = titleElement.attr("href");
}else{
//处理百度百科
titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery);
LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery);
titleElement = document.select(titleCssQuery).first();
if(titleElement != null){
titleText = titleElement.text();
href = titleElement.attr("href");
}
}
LOG.debug(titleText);
Element summaryElement = document.select(summaryCssQuery).first();
//处理百度知道
if(summaryElement == null){
summaryCssQuery = summaryCssQuery.replace("div.c-abstract","font");
LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery);
summaryElement = document.select(summaryCssQuery).first();
}
String summaryText = "";
if(summaryElement != null){
summaryText = summaryElement.text();
}
LOG.debug(summaryText);
if (titleText != null && !"".equals(titleText.trim()) && summaryText != null && !"".equals(summaryText.trim())) {
Webpage webpage = new Webpage();
webpage.setTitle(titleText);
webpage.setUrl(href);
webpage.setSummary(summaryText);
/*if (href != null) {
String content = Tools.getHTMLContent(href);
webpage.setContent(content);
} else {
LOG.info("页面正确提取失败");
}*/
webpages.add(webpage);
} else {
LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText);
}
}
} catch (IOException ex) {
LOG.error("搜索出错",ex);
}
// searchResult.setWebpages(webpages);
return webpages;
}
/**
* 获取百度搜索结果数
* 获取如下文本并解析数字:
* 百度为您找到相关结果约13,200个
* @param document 文档
* @return 结果数
*/
private int getBaiduSearchResultCount(Document document){
String cssQuery = "html body div div div div.nums";
LOG.debug("total cssQuery: " + cssQuery);
Element totalElement = document.select(cssQuery).first();
String totalText = totalElement.text();
LOG.info("搜索结果文本:" + totalText);
String regEx="[^0-9]";
Pattern pattern = Pattern.compile(regEx);
Matcher matcher = pattern.matcher(totalText);
totalText = matcher.replaceAll("");
int total = Integer.parseInt(totalText);
LOG.info("搜索结果数:" + total);
return total;
}
public static void main(String[] args) {
Searcher searcher = new JSoupBaiduSearcher();
List<Webpage> webpages = searcher.search("六扇门",2);
if (webpages != null) {
int i = 2;
LOG.info("搜索结果 当前第 " + 1 + " 页,页面大小为:" + webpages.size() + " 共有结果数:" + webpages.size());
for (Webpage webpage : webpages) {
LOG.info("搜索结果 " + (i++) + " :");
LOG.info("标题:" + webpage.getTitle());
LOG.info("URL:" + webpage.getUrl());
LOG.info("摘要:" + webpage.getSummary());
LOG.info("正文:" + webpage.getContent());
LOG.info("");
}
} else {
LOG.error("没有搜索到结果");
}
}
项目源代码:https://github.com/ysc/search
3、使用Jsoup解析谷歌搜索结果
package crawler;
/**
* Created by Last on 10/16/2016.
*/
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import fileHandling.SaveInFile;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import POJO.ScrapedResult;
/**
 * Scrapes Google News results for a query, one day at a time, and downloads the text of
 * every linked article.
 */
public class Crawler {
    public static ArrayList<String> dates;
    public static int delay;
    public static String[] USER_AGENT_MAC = {
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
            "Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)",
            "Mozilla/5.0 (compatible; ABrowse 0.4; Syllable)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 1.1.4322)",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.7 (KHTML, like Gecko) Comodo_Dragon/16.1.1.0 Chrome/16.0.912.63 Safari/535.7",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.30 (KHTML, like Gecko) Comodo_Dragon/12.1.0.0 Chrome/12.0.742.91 Safari/534.30",
            "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Comodo_Dragon/4.1.1.11 Chrome/4.1.249.1042 Safari/532.5",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13",
            "Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.5) Gecko/20031026 Firebird/0.7",
            "Mozilla/5.0 (Windows; U; Win98; de-DE; rv:1.5a) Gecko/20030728 Mozilla Firebird/0.6.1",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4a) Gecko/20030425 Mozilla Firebird/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:2.0) Treco/20110515 Fireweb Navigator/2.4",
            "Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20121202 Firefox/17.0 Iceweasel/17.0.1",
            "Mozilla/5.0 (X11; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1 Iceweasel/15.0.1",
            "Mozilla/5.0 (X11; debian; Linux x86_64; rv:15.0) Gecko/20100101 Iceweasel/15.0",
            "Mozilla/5.0 (X11; Linux i686; rv:14.0) Gecko/20100101 Firefox/14.0.1 Iceweasel/14.0.1",
            "Mozilla/5.0 (X11; Linux i686; rv:14.0) Gecko/20100101 Firefox/14.0 Iceweasel/14.0",
            "Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0.1 Iceweasel/13.0.1",
            "Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0 Iceweasel/13.0"};

    /** Shared generator for user-agent rotation and the delay between result pages. */
    private static final Random RANDOM = new Random();

    /** Picks a user agent from {@link #USER_AGENT_MAC}; every entry of the array is eligible. */
    private static String randomUserAgent() {
        return USER_AGENT_MAC[RANDOM.nextInt(USER_AGENT_MAC.length)];
    }

    /**
     * Runs one Google News search restricted to a single day and scrapes every hit.
     *
     * @param query         search terms, inserted into the URL as given
     * @param scrapedResult passed through to {@link #processPage}
     * @param date          day to search, as three '/'-separated parts (e.g. "18/03/2009")
     * @return the scraped articles, empty when nothing could be fetched
     * @throws IllegalArgumentException if {@code date} has fewer than three parts
     */
    public static ArrayList<ScrapedResult> input(String query, ScrapedResult scrapedResult, String date) {
        System.out.println("I'm here and the date is " + date);
        ArrayList<ScrapedResult> resultList = new ArrayList<ScrapedResult>();
        String[] dateParts = date.split("/");
        if (dateParts.length < 3) {
            throw new IllegalArgumentException("date must have three '/'-separated parts: " + date);
        }
        // The same day is used as both lower and upper bound of the custom date range.
        String encodedDate = dateParts[0] + "%2F" + dateParts[1] + "%2F" + dateParts[2];
        String url = "https://www.google.co.in/search?q=" + query
                + "&hl=en&gl=in&as_drrb=b&authuser=0&source=lnt&tbs=cdr%3A1%2Ccd_min%3A" + encodedDate
                + "%2Ccd_max%3A" + encodedDate + "&tbm=nws";
        try {
            // One request per day is enough; fetching the identical URL twice only doubled
            // the traffic and the random delay while throwing the first result away.
            resultList = processPage(scrapedResult, date, url);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return resultList;
    }

    /**
     * Builds the list of days to crawl, from the day after 18/03/2009 up to and including
     * 31/10/2016, formatted as dd/MM/yyyy. Also stores the list in {@link #dates}.
     *
     * @return the generated list of dates
     */
    public ArrayList<String> pickDates() {
        dates = new ArrayList<String>();
        System.out.println("Dates");
        String dt = "18/03/2009"; // Start date
        SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
        Calendar c = Calendar.getInstance();
        try {
            c.setTime(sdf.parse(dt));
        } catch (ParseException e) {
            e.printStackTrace();
        }
        while (!dt.equals("31/10/2016")) {
            c.add(Calendar.DATE, 1); // advance by one day
            dt = sdf.format(c.getTime());
            dates.add(dt);
            System.out.println("" + dt);
            System.out.println("111111111111111111111");
        }
        return dates;
    }

    /** Reads a query from standard input and crawls it for every generated date. */
    public static void main(String[] args) throws IOException {
        Crawler crawler = new Crawler();
        dates = crawler.pickDates();
        System.out.println("Dates size : " + dates.size());
        ScrapedResult pojo = null;
        SaveInFile saveInFile = new SaveInFile();
        Scanner sc = new Scanner(System.in);
        System.out.println("Query");
        // Read the whole line and keep the replacement result: Strings are immutable, so the
        // original call without assignment left the query untouched.
        String query = sc.nextLine().trim().replaceAll("\\s+", "+");
        sc.close();
        for (String date : dates) {
            ArrayList<ScrapedResult> resultList = input(query, pojo, date);
            saveInFile.writeFile(resultList);
            for (ScrapedResult scr : resultList) {
                System.out.println("" + scr.getDate());
                System.out.println("" + scr.getText());
            }
        }
    }

    /**
     * Downloads one result page, follows every {@code a.l._HId} link on it and scrapes the
     * linked article, then sleeps for a random time below two minutes.
     *
     * @param scrapedResult passed through to the per-link scraping step
     * @param wanted        date string stored on every scraped result
     * @param url           result page to download
     * @return the results collected so far; never {@code null}, so callers can iterate it
     *         even after a failure
     * @throws IOException declared for callers; failures inside are caught and printed
     */
    public static ArrayList<ScrapedResult> processPage(ScrapedResult scrapedResult, String wanted, String url) throws
            IOException {
        ArrayList<ScrapedResult> resultList = new ArrayList<ScrapedResult>();
        System.out.println("------ :\t\t" + url + "\t\t: ------");
        try {
            Document doc = Jsoup.connect(url).userAgent(randomUserAgent()).get();
            for (Element aTag : doc.select("a.l._HId")) {
                String link = aTag.attr("href");
                // attr() yields an empty string for a missing attribute; there is nothing to fetch.
                if (link.isEmpty()) {
                    continue;
                }
                System.out.println("**********************************[" + link + "]**********************************");
                resultList.add(navigatePage(scrapedResult, wanted, link));
            }
        } catch (Exception e1) {
            e1.printStackTrace();
            return resultList;
        }
        try {
            Thread.sleep(RANDOM.nextInt(120) * 1000);
        } catch (InterruptedException ie) {
            // Restore the interrupt flag so that callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        System.out.println("HA " + resultList.size());
        return resultList;
    }

    /**
     * Downloads one article and wraps its text in a new {@code ScrapedResult}.
     *
     * @param scrapedResult ignored; a fresh result object is always created
     * @param wanted        date string stored on the result
     * @param linkI         article URL
     * @return a result whose text is the page text, or the literal "NULL" when the page
     *         could not be fetched
     */
    private static ScrapedResult navigatePage(ScrapedResult scrapedResult, String wanted, String linkI) {
        ScrapedResult result = new ScrapedResult();
        result.setDate(wanted);
        String resultText = null;
        try {
            Document doc = Jsoup.connect(linkI).userAgent(randomUserAgent()).get();
            resultText = extractText(doc);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (IllegalArgumentException e) {
            // Raised for a malformed link; treat it like a failed download so that one bad
            // link does not abort the remaining links of the page.
            e.printStackTrace();
        }
        result.setText(resultText != null ? resultText : "NULL");
        System.out.println("Final ______" + result.getText());
        return result;
    }

    /**
     * Returns the full text of the document.
     *
     * @param doc parsed page
     * @return the value of {@code doc.text()}
     */
    private static String extractText(Document doc) {
        return doc.text();
    }
}
项目源代码:https://github.com/adishjain/GoogleResultScrapper
相关文档:
Java jsoup 请求Url地址及处理响应的JSON数据方法代码
Java jsoup post提交数据和获取提交cookie的方法代码