目的
如果网页是静态内容,直接用 HttpClient 获取即可。但如果网页内容是动态加载的(比如通过 JS 加载),就需要等网页完全渲染之后再去获取内容。
使用的jar包
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
如果没有使用 Maven,就需要手动下载 jar 包(htmlunit 依赖的 jar 包比较多)。
例1. 只获取渲染后的网页内容
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.junit.Test;
/**
 * Example 1: fetch a page with a headless HtmlUnit browser and print the
 * DOM as it looks after the page's JavaScript has run.
 */
public class MyTest {

    /** Upper bound, in milliseconds, on the wait for background JavaScript. */
    private static final long JS_WAIT_MILLIS = 10000L;

    /**
     * Loads the target URL with JavaScript enabled, waits for background
     * scripts to settle, and prints the resulting page as XML.
     * Failures are reported on stderr instead of being silently dropped.
     */
    @Test
    public void test() {
        // WebClient is AutoCloseable, so try-with-resources closes it on every path.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Do not throw when a script on the page fails.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // Do not throw when the HTTP status is not 200.
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            // CSS is kept enabled on purpose: nothing is displayed, but some
            // pages fail to load their content when CSS is turned off.
            webClient.getOptions().setCssEnabled(true);
            // Enable JavaScript.
            webClient.getOptions().setJavaScriptEnabled(true);
            // Important: enable AJAX support.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());

            HtmlPage page = webClient.getPage("https://www.baidu.com/");
            // Scripts scheduled in the background (timers, async requests) may
            // still be running; give them a bounded amount of time to finish.
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);
            System.out.println(page.asXml());
        } catch (Exception e) {
            // Test-method boundary: report the failure rather than hiding it.
            System.err.println("Failed to fetch rendered page: " + e);
        }
    }
}
例2. 模拟post请求登录,然后获取渲染后的网页内容
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import org.junit.Test;
/**
 * Example 2: fill in a login form, submit it, and print the DOM of the
 * page that results after its JavaScript has run.
 */
public class MyTest {

    /** Upper bound, in milliseconds, on the wait for background JavaScript. */
    private static final long JS_WAIT_MILLIS = 10000L;

    /**
     * Loads the login page, fills whichever of the common user-name fields
     * exist plus the password field, clicks the submit input, and prints
     * the resulting page as XML. Failures are reported on stderr.
     */
    @Test
    public void test() {
        // try-with-resources closes the client on every path, including the
        // early return taken when the login page cannot be loaded.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Do not throw when a script on the page fails.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // Do not throw when the HTTP status is not 200.
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            // CSS is kept enabled on purpose: nothing is displayed, but some
            // pages fail to load their content when CSS is turned off.
            webClient.getOptions().setCssEnabled(true);
            // Enable JavaScript.
            webClient.getOptions().setJavaScriptEnabled(true);
            // Important: enable AJAX support.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());

            HtmlPage page = null;
            try {
                page = webClient.getPage("https://www.baidu.com/");
            } catch (Exception e) {
                System.err.println("Failed to load login page: " + e);
            }
            if (page == null) {
                return;
            }

            // Fill in the user name and password. The real field name is not
            // known in advance, so every common candidate is tried.
            fillInput(page, "username", "admin", false);
            fillInput(page, "userName", "admin", true);
            fillInput(page, "account", "admin", true);
            fillInput(page, "user", "admin", true);
            fillInput(page, "password", "1111", false);

            // Submit the form.
            try {
                HtmlSubmitInput submit = page.getElementByName("submit");
                HtmlPage nextPage = submit.click();
                // Give scripts triggered by the submit a bounded time to finish.
                webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);
                System.out.println(nextPage.asXml());
            } catch (Exception e) {
                System.err.println("Failed to submit login form: " + e);
            }
        }
    }

    /**
     * Best-effort fill of the input named {@code name}; does nothing when
     * the page has no such input.
     *
     * @param page           page to search
     * @param name           value of the input's {@code name} attribute
     * @param value          text to put into the input
     * @param simulateTyping {@code true} to enter the text via {@code type},
     *                       {@code false} to set the value attribute directly
     */
    private static void fillInput(HtmlPage page, String name, String value,
            boolean simulateTyping) {
        try {
            // getElementByName throws when nothing matches instead of
            // returning null, so no null check is needed here.
            HtmlInput input = page.getElementByName(name);
            if (simulateTyping) {
                input.type(value);
            } else {
                input.setValueAttribute(value);
            }
        } catch (Exception ignored) {
            // Deliberately ignored: the candidate names are guesses, and a
            // missing or non-input element just means this one did not apply.
        }
    }
}
注意
如果报错或者没有获取到渲染后的内容,可以尝试同时开启 JS 和 CSS。有时候没有开启 CSS,内容也会加载不出来。