编程语言 php java

java实现抓取百度对某站点搜索结果

java HTML我帮您 1年前  0次浏览
HtmlUnit 是一款开源的 Java 页面分析工具,读取页面后,可以有效地使用 HtmlUnit 分析页面上的内容。该项目可以模拟浏览器运行,被誉为 Java 浏览器的开源实现。这个没有界面的浏览器,运行速度也非常快。
相关文件下载地址:
http://sourceforge.net/projects/htmlunit/files/
http://jaist.dl.sourceforge.net/project/htmlunit/htmlunit/2.15/htmlunit-2.15-bin.zip

我的需求是使用百度的高级新闻搜索,抓取指定站点新闻
手动搜索的设置如图所示
33
package com.html580;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;

import com.gargoylesoftware.htmlunit.ElementNotFoundException;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlHiddenInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlRadioButtonInput;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;

/**
* @description 抓取百度搜索结果
* @author dzf
* @site http://www.html580.com
* @version 2014-7-23
*/
public class BaiduSpider {

public static void main(String<> args) {
try {
final WebClient webclient = new WebClient();
final HtmlPage htmlpage = webclient.getPage("http://www.baidu.com/gaoji/advanced.html");

//搜索按钮
final HtmlForm form = htmlpage.getFormByName("f1");
final HtmlSubmitInput button = form.getInputByValue("百度一下");

//搜索结果-关键词
final HtmlTextInput textField = form.getInputByName("q1");
textField.setValueAttribute("HTML我帮您");

//分页条数
final HtmlSelect htmlSelet=form.getSelectByName("rn");
htmlSelet.setDefaultValue("10");

//网页的时间
final HtmlSelect htmlSeletlm=form.getSelectByName("rn");
htmlSeletlm.setDefaultValue("0");

//语言
final List radioButtonCts = form.getRadioButtonsByName("ct");
radioButtonCts.get(0).setChecked(true);
radioButtonCts.get(1).setChecked(false);
radioButtonCts.get(2).setChecked(false);

//文档格式
final HtmlSelect htmlSeletft=form.getSelectByName("ft");
htmlSeletft.setDefaultValue("");

//关键词位置
final List radioButtonq5s = form.getRadioButtonsByName("q5");
radioButtonq5s.get(0).setChecked(true);
radioButtonq5s.get(1).setChecked(false);
radioButtonq5s.get(2).setChecked(false);

//站内搜索 限定要搜索指定的网站
final HtmlTextInput htmlTextInputq6 = form.getInputByName("q6");
htmlTextInputq6.setDefaultValue("html580.com");

//隐藏值
final HtmlHiddenInput hiddenInputtn = form.getInputByName("tn");
hiddenInputtn.setDefaultValue("baiduadv");

final HtmlPage page2 = button.click();
String result = page2.asXml();
System.out.println(result);
webclient.closeAllWindows();
} catch (FailingHttpStatusCodeException e) {
e.printStackTrace();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (ElementNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}


发表评论