Comment extraire des liens HTML avec une expression régulière
Dans ce didacticiel, nous allons vous montrer comment extraire un lien hypertexte d'une page HTML. Par exemple, pour obtenir le lien à partir du contenu suivant:
this is text1 <a href='example.com' target='_blank'>hello</a> this is text2...
-
Obtenez d'abord la «valeur» de la balise `a` - Résultat : `a href='example.com' target='_blank'`
-
Récupérez ensuite le «lien» à partir de la valeur extraite ci-dessus - Résultat :
example.com
1. Modèle d'expression régulière
Modèle d'expression régulière pour extraire la balise `a`
(?i)<a([^>]+)>(.+?)</a>
Modèle d'expression régulière pour extraire le lien (attribut `href`) de la balise `a`
\s*(?i)href\s*=\s*("([^"]*")|'[^']*'|([^'">\s]+))
Description
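(?i)        # inline flag (not a capturing group): all matching is case-insensitive
<a          # starts with "<a"
(           # start of group #1
  [^>]+     #   anything except ">", at least one character
)           # end of group #1
>           # followed by ">"
(.+?)       # group #2: matches anything, as little as possible (the link text)
</a>        # ends with "</a>"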
\s*             # can start with whitespace
(?i)            # all matching is case-insensitive
href            # followed by the word "href"
\s*=\s*         # allows spaces on either side of the equal sign
(               # start of group #1
  "([^"]*")     #   allows a string enclosed in double quotes - "string"
  |             #   ..or
  '[^']*'       #   allows a string enclosed in single quotes - 'string'
  |             #   ..or
  ([^'">\s]+)   #   an unquoted value: no single quote, double quote, ">" or whitespace
)               # end of group #1
2. Exemple d'extraction de lien Java
Voici un exemple simple d'extraction de lien en Java : le premier modèle extrait la valeur de la balise `a`, puis le deuxième modèle extrait le lien à partir de cette valeur.
HTMLLinkExtractor.java
package com.example.crawler.core;

import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts hyperlinks from HTML content with two regular expressions.
 *
 * <p>The first pattern locates each {@code <a ...>text</a>} element and captures its
 * attribute text and its link text; the second pattern is then run over the attribute
 * text to capture the {@code href} value.
 */
public class HTMLLinkExtractor {

  /**
   * Anchor element, matched case-insensitively. Group 1 is everything between
   * {@code <a} and {@code >}; group 2 is the (lazily matched) text up to {@code </a>}.
   */
  private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";

  /**
   * {@code href} attribute, matched case-insensitively. Group 1 is the value, which may be
   * double-quoted, single-quoted, or unquoted (quotes, if any, are part of the group).
   */
  private static final String HTML_A_HREF_TAG_PATTERN =
      "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";

  // Compiled once and shared; Pattern instances are immutable.
  private static final Pattern PATTERN_TAG = Pattern.compile(HTML_A_TAG_PATTERN);
  private static final Pattern PATTERN_LINK = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

  public HTMLLinkExtractor() {
  }

  /**
   * Extracts every link found in the given HTML.
   *
   * @param html HTML content to scan; {@code null} is treated as empty content
   * @return the links in document order, each with its href value and link text;
   *     empty when nothing matches
   */
  public Vector<HtmlLink> grabHTMLLinks(final String html) {
    Vector<HtmlLink> result = new Vector<HtmlLink>();
    if (html == null) {
      return result;
    }

    // Matchers are locals so that no state is shared between calls.
    Matcher matcherTag = PATTERN_TAG.matcher(html);
    while (matcherTag.find()) {
      String attributes = matcherTag.group(1); // text between "<a" and ">"
      String linkText = matcherTag.group(2); // text between ">" and "</a>"

      Matcher matcherLink = PATTERN_LINK.matcher(attributes);
      while (matcherLink.find()) {
        // Group 1 still carries the surrounding quotes; setLink strips them.
        HtmlLink obj = new HtmlLink();
        obj.setLink(matcherLink.group(1));
        obj.setLinkText(linkText);
        result.add(obj);
      }
    }
    return result;
  }

  /** A single extracted link: the href value and the text enclosed by the anchor element. */
  static class HtmlLink {

    String link;
    String linkText;

    HtmlLink() {
    }

    @Override
    public String toString() {
      return "Link : " + this.link + " Link Text : " + this.linkText;
    }

    public String getLink() {
      return link;
    }

    /**
     * Stores the link after removing every single and double quote character from it.
     *
     * @param link raw href value, possibly still quoted
     */
    public void setLink(String link) {
      this.link = replaceInvalidChar(link);
    }

    public String getLinkText() {
      return linkText;
    }

    public void setLinkText(String linkText) {
      this.linkText = linkText;
    }

    /** Removes all {@code '} and {@code "} characters (literal replacement, no regex needed). */
    private static String replaceInvalidChar(String link) {
      return link.replace("'", "").replace("\"", "");
    }
  }
}
3. Test unitaire
Test unitaire avec TestNG. Le contenu HTML est simulé via `@DataProvider`.
TestHTMLLinkExtractor.java
package com.example.crawler.core;

import java.util.Vector;

import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import com.example.crawler.core.HTMLLinkExtractor.HtmlLink;

/**
 * TestNG tests for {@link HTMLLinkExtractor}.
 *
 * <p>Every fixture contains at least one anchor whose href is {@link #TEST_LINK}, written
 * with different tag/attribute casing, quoting styles and attribute orders.
 *
 * @author example
 */
public class TestHTMLLinkExtractor {

  private HTMLLinkExtractor htmlLinkExtractor;

  /** The href value every extracted link is expected to carry. */
  static final String TEST_LINK = "http://www.google.com";

  @BeforeClass
  public void initData() {
    htmlLinkExtractor = new HTMLLinkExtractor();
  }

  /**
   * Supplies HTML snippets, one per test invocation.
   *
   * @return rows of a single String argument each
   */
  @DataProvider
  public Object[][] HTMLContentProvider() {
    return new Object[][] {
      // lower-case tag and attribute, single quotes
      new Object[] { "abc hahaha <a href='http://www.google.com'>google</a>" },
      // upper-case attribute name
      new Object[] { "abc hahaha <a HREF='http://www.google.com'>google</a>" },
      // two anchors in the same content
      new Object[] { "abc hahaha <A HREF='http://www.google.com'>google</A> , "
          + "abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>" },
      // href followed by another attribute
      new Object[] { "abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>" },
      // href preceded by another attribute
      new Object[] { "abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>" },
      // double-quoted href value
      new Object[] { "abc hahaha <A target='_blank' HREF=\"http://www.google.com\">google</A>" },
      // unquoted href value
      new Object[] { "abc hahaha <a HREF=http://www.google.com>google</a>" },
    };
  }

  /**
   * Checks that at least one link is extracted and that every extracted link equals
   * {@link #TEST_LINK}.
   *
   * @param html HTML snippet from {@link #HTMLContentProvider()}
   */
  @Test(dataProvider = "HTMLContentProvider")
  public void ValidHTMLLinkTest(String html) {
    Vector<HtmlLink> links = htmlLinkExtractor.grabHTMLLinks(html);

    // At least one link must have been found.
    Assert.assertTrue(links.size() != 0);

    for (HtmlLink htmlLink : links) {
      Assert.assertEquals(htmlLink.getLink(), TEST_LINK);
    }
  }
}
Résultat
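[TestNG] Running: /private/var/folders/w8/jxyz5pf51lz7nmqm_hv5z5br0000gn/T/testng-eclipse--530204890/testng-customsuite.xml
PASSED: ValidHTMLLinkTest("abc hahaha <a href='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com'>google</A> , abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF="http://www.google.com">google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF=http://www.google.com>google</a>")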