在本教程中,我们将向您展示如何从HTML页面提取超链接。 例如,要从以下内容获取链接:
this is text1 <a href='mkyong.com' target='_blank'>hello</a> this is text2...
- 首先获取 a 标签的“值”——结果:a href='mkyong.com' target='_blank'
- 然后从上面提取的值中获取“链接”——结果:mkyong.com
1.正则表达式模式
提取标签正则表达式模式
(?i)<a([^>]+)>(.+?)</a>
从标签正则表达式模式中提取链接
\s*(?i)href\s*=\s*("([^"]*")|'[^']*'|([^'">\s]+))
描述
(?i) # inline flag, not a capturing group: all matching is case insensitive
<a # start with "<a"
( # start of group #1
[^>]+ # anything except (">"), at least one character
) # end of group #1
> # follow by ">"
(.+?) # group #2: match anything (reluctantly) - the link text
</a> # end with "</a>"
\s* #can start with whitespace
(?i) # all checking is case insensitive
href # follow by "href" word
\s*=\s* # allows spaces on either side of the equal sign,
( # start of group #1
"([^"]*") # allow string with double quotes enclosed - "string"
| # ..or
'[^']*' # allow string with single quotes enclosed - 'string'
| # ..or
([^'">\s]+) # unquoted value: can't contain single quotes, double quotes, ">" or whitespace
) # end of group #1
2. Java链接提取器示例
下面是一个简单的 Java 链接提取器示例:先用第一个模式提取 a 标签的值,再用第二个模式从该值中提取链接。
HTMLLinkExtractor.java
package com.mkyong.crawler.core;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts hyperlinks from HTML text with two regular expressions: one locates
 * {@code <a ...>text</a>} elements, the other pulls each {@code href} value out
 * of the element's attribute list.
 *
 * <p>This is a regex heuristic, not an HTML parser; for example, nested anchors
 * or {@code >} characters inside attribute values are not handled.
 *
 * <p>The compiled patterns are shared constants and every matcher is a local
 * variable, so the extractor itself keeps no mutable state between calls.
 */
public class HTMLLinkExtractor {

    /**
     * Matches an anchor element, case-insensitively and across line breaks.
     * Group 1 is the attribute list (it must start with whitespace, so tags
     * such as {@code <abbr>} or {@code <area>} are not mistaken for anchors);
     * group 2 is the (possibly empty) link text.
     */
    private static final Pattern HTML_A_TAG_PATTERN =
            Pattern.compile("(?is)<a(\\s[^>]*)>(.*?)</a>");

    /**
     * Matches an {@code href} attribute inside an attribute list. The attribute
     * name must be preceded by whitespace (or start of input) so that names
     * like {@code data-href} are ignored. Group 1 is the raw value, still
     * wrapped in its quotes when it was quoted.
     */
    private static final Pattern HTML_A_HREF_TAG_PATTERN =
            Pattern.compile("(?i)(?:^|\\s)href\\s*=\\s*(\"[^\"]*\"|'[^']*'|[^'\">\\s]+)");

    /** Creates an extractor; all patterns are precompiled class constants. */
    public HTMLLinkExtractor() {
    }

    /**
     * Extracts every link found in the given HTML.
     *
     * @param html
     *            HTML content to scan; {@code null} is treated as empty input
     * @return the links and their link text in document order; never
     *         {@code null}, empty when nothing matches
     */
    public Vector<HtmlLink> grabHTMLLinks(final String html) {
        Vector<HtmlLink> result = new Vector<HtmlLink>();
        if (html == null) {
            return result;
        }

        Matcher tagMatcher = HTML_A_TAG_PATTERN.matcher(html);
        while (tagMatcher.find()) {
            String attributes = tagMatcher.group(1);
            String linkText = tagMatcher.group(2);

            // An anchor may (incorrectly) carry several href attributes;
            // each one is reported, as before.
            Matcher hrefMatcher = HTML_A_HREF_TAG_PATTERN.matcher(attributes);
            while (hrefMatcher.find()) {
                HtmlLink obj = new HtmlLink();
                obj.setLink(hrefMatcher.group(1));
                obj.setLinkText(linkText);
                result.add(obj);
            }
        }
        return result;
    }

    /**
     * A single extracted hyperlink: the target taken from {@code href} and the
     * text between the opening and closing anchor tags.
     */
    static class HtmlLink {

        String link;
        String linkText;

        HtmlLink() {
        }

        @Override
        public String toString() {
            return new StringBuilder("Link : ").append(this.link)
                    .append(" Link Text : ").append(this.linkText).toString();
        }

        public String getLink() {
            return link;
        }

        /**
         * Stores the link target, removing one pair of enclosing single or
         * double quotes if present.
         *
         * @param link
         *            raw attribute value, quoted or unquoted; may be {@code null}
         */
        public void setLink(String link) {
            this.link = stripEnclosingQuotes(link);
        }

        public String getLinkText() {
            return linkText;
        }

        public void setLinkText(String linkText) {
            this.linkText = linkText;
        }

        /**
         * Removes a matching pair of surrounding quotes. Quote characters
         * inside the value are kept, so targets such as
         * {@code javascript:alert('x')} are not corrupted.
         */
        private static String stripEnclosingQuotes(String value) {
            if (value == null || value.length() < 2) {
                return value;
            }
            char first = value.charAt(0);
            char last = value.charAt(value.length() - 1);
            boolean quoted = (first == '"' || first == '\'') && first == last;
            return quoted ? value.substring(1, value.length() - 1) : value;
        }
    }
}
3.单元测试
使用 TestNG 进行单元测试,通过 @DataProvider 模拟 HTML 内容。
TestHTMLLinkExtractor.java
package com.mkyong.crawler.core;
import java.util.Vector;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import com.mkyong.crawler.core.HTMLLinkExtractor.HtmlLink;
/**
 * TestNG tests for the HTML link extractor.
 *
 * @author mkyong
 *
 */
public class TestHTMLLinkExtractor {

    private HTMLLinkExtractor htmlLinkExtractor;

    String TEST_LINK = "http://www.google.com";

    /** Creates the extractor under test. */
    @BeforeClass
    public void initData() {
        this.htmlLinkExtractor = new HTMLLinkExtractor();
    }

    /**
     * Supplies HTML snippets that each contain at least one anchor pointing at
     * {@code TEST_LINK}, varying tag/attribute case, quoting style and
     * attribute order.
     *
     * @return one single-element row per HTML snippet
     */
    @DataProvider
    public Object[][] HTMLContentProvider() {
        final String[] samples = {
            "abc hahaha <a href='" + TEST_LINK + "'>google</a>",
            "abc hahaha <a HREF='" + TEST_LINK + "'>google</a>",
            "abc hahaha <A HREF='" + TEST_LINK + "'>google</A> , "
                    + "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>",
            "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>",
            "abc hahaha <A target='_blank' HREF='" + TEST_LINK + "'>google</A>",
            "abc hahaha <A target='_blank' HREF=\"" + TEST_LINK + "\">google</A>",
            "abc hahaha <a HREF=" + TEST_LINK + ">google</a>",
        };

        final Object[][] rows = new Object[samples.length][];
        for (int row = 0; row < samples.length; row++) {
            rows[row] = new Object[] { samples[row] };
        }
        return rows;
    }

    /**
     * Checks that at least one link is extracted from the snippet and that
     * every extracted link equals {@code TEST_LINK}.
     *
     * @param html
     *            HTML snippet from the data provider
     */
    @Test(dataProvider = "HTMLContentProvider")
    public void ValidHTMLLinkTest(String html) {
        final Vector<HtmlLink> extracted = htmlLinkExtractor.grabHTMLLinks(html);

        // at least one link must have been extracted
        Assert.assertTrue(extracted.size() != 0);

        for (HtmlLink extractedLink : extracted) {
            Assert.assertEquals(extractedLink.getLink(), TEST_LINK);
        }
    }
}
结果
[TestNG] Running:
/private/var/folders/w8/jxyz5pf51lz7nmqm_hv5z5br0000gn/T/testng-eclipse--530204890/testng-customsuite.xml
PASSED: ValidHTMLLinkTest("abc hahaha <a href='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com'>google</A> , abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF="http://www.google.com">google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF=http://www.google.com>google</a>")