Comment extraire des liens HTML avec une expression régulière

Dans ce didacticiel, nous allons vous montrer comment extraire un lien hypertexte d'une page HTML. Par exemple, pour obtenir le lien à partir du contenu suivant:

this is text1 <a href='example.com' target='_blank'>hello</a> this is text2...
  1. Obtenez d'abord la « valeur » de la balise a - Résultat : a href='example.com' target='_blank'

  2. Récupérez ensuite le « lien » à partir de la valeur extraite ci-dessus - Résultat : example.com

1. Modèle d'expression régulière

Modèle d'expression régulière pour extraire la balise a

(?i)<a([^>]+)>(.+?)</a>

Modèle d'expression régulière pour extraire le lien de la balise a

\s*(?i)href\s*=\s*("([^"]*")|'[^']*'|([^'">\s]+))

Description

(       #start of group #1
 ?i     #  all checking are case insensitive
)       #end of group #1
<a      #start with "<a"
  (        #  start of group #2
    [^>]+    #     anything except (">"), at least one character
   )        #  end of group #2
  >      #     follow by ">"
    (.+?)   #   match anything
         </a>     #     end with "</a>"
\s*            #can start with whitespace
  (?i)             # all checking are case insensitive
     href          #  follow by "href" word
        \s*=\s*        #   allows spaces on either side of the equal sign,
              (        #    start of group #1
               "([^"]*")   #      allow string with double quotes enclosed - "string"
               |       #      ..or
               '[^']*'     #        allow string with single quotes enclosed - 'string'
               |           #      ..or
               ([^'">\s]+)   #      unquoted value: no single quote, double quote, ">" or whitespace
          )        #    end of group #1

Voici un exemple Java simple d'extraction de liens : le premier modèle extrait la valeur de la balise a, puis le deuxième modèle extrait le lien à partir de cette valeur.

HTMLLinkExtractor.java

package com.example.crawler.core;

import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts hyperlinks from HTML text using two regular expressions.
 *
 * <p>The first expression locates each {@code <a ...>text</a>} element and
 * captures its attribute string and its link text. The second expression is
 * then run over the attribute string to pull out the {@code href} value.
 *
 * <p>This is a regex-based scan, not an HTML parser: link text spanning
 * several lines is not matched because {@code .} does not match line
 * terminators in the tag pattern.
 */
public class HTMLLinkExtractor {

    /**
     * Matches an anchor element, case-insensitively.
     * Group 1 is everything between {@code <a} and {@code >} (the attributes),
     * group 2 is the text between the opening and closing tags.
     */
    private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";

    /**
     * Matches an {@code href} attribute, case-insensitively. Group 1 is the
     * value, which is double-quoted, single-quoted, or unquoted; surrounding
     * quotes are still part of group 1.
     */
    private static final String HTML_A_HREF_TAG_PATTERN =
        "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";

    // Compiled once: Pattern is immutable and safe to share between instances.
    private static final Pattern PATTERN_TAG = Pattern.compile(HTML_A_TAG_PATTERN);
    private static final Pattern PATTERN_LINK = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

    /** Creates an extractor. The instance holds no state of its own. */
    public HTMLLinkExtractor() {
    }

    /**
     * Finds every anchor element in the given HTML and returns its link(s).
     *
     * @param html
     *            HTML content to scan; {@code null} yields an empty result
     * @return one {@link HtmlLink} per {@code href} found, in document order;
     *         never {@code null}
     */
    public Vector<HtmlLink> grabHTMLLinks(final String html) {

        Vector<HtmlLink> result = new Vector<>();
        if (html == null) {
            return result;
        }

        // Matchers are locals so that concurrent or nested calls cannot
        // interfere with each other through shared fields.
        Matcher matcherTag = PATTERN_TAG.matcher(html);

        while (matcherTag.find()) {

            String attributes = matcherTag.group(1); // e.g. " href='x' target='_blank'"
            String linkText = matcherTag.group(2);   // text between <a ...> and </a>

            Matcher matcherLink = PATTERN_LINK.matcher(attributes);

            while (matcherLink.find()) {
                HtmlLink obj = new HtmlLink();
                obj.setLink(matcherLink.group(1)); // quotes are stripped by setLink
                obj.setLinkText(linkText);
                result.add(obj);
            }
        }

        return result;
    }

    /**
     * A single extracted hyperlink: the {@code href} target and the anchor text.
     */
    static class HtmlLink {

        String link;
        String linkText;

        HtmlLink() {
        }

        @Override
        public String toString() {
            return "Link : " + this.link + " Link Text : " + this.linkText;
        }

        public String getLink() {
            return link;
        }

        /**
         * Stores the link after removing every single and double quote from it.
         *
         * @param link raw {@code href} value, possibly still quoted
         */
        public void setLink(String link) {
            this.link = replaceInvalidChar(link);
        }

        public String getLinkText() {
            return linkText;
        }

        public void setLinkText(String linkText) {
            this.linkText = linkText;
        }

        /** Removes all {@code '} and {@code "} characters from the given link. */
        private String replaceInvalidChar(String link) {
            // Literal replacement; no regular expression is needed here.
            return link.replace("'", "").replace("\"", "");
        }

    }
}

3. Test unitaire

Test unitaire avec TestNG. Le contenu HTML est simulé via @DataProvider.

TestHTMLLinkExtractor.java

package com.example.crawler.core;

import java.util.Vector;

import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import com.example.crawler.core.HTMLLinkExtractor.HtmlLink;

/**
 * TestNG tests for {@link HTMLLinkExtractor}.
 *
 * <p>Each data-provider row is an HTML fragment containing one or more anchor
 * elements whose {@code href} is {@link #TEST_LINK}, written with different
 * casing, quoting, and attribute order.
 *
 * @author example
 *
 */
public class TestHTMLLinkExtractor {

    private HTMLLinkExtractor htmlLinkExtractor;
    static final String TEST_LINK = "http://www.google.com";

    @BeforeClass
    public void initData() {
        htmlLinkExtractor = new HTMLLinkExtractor();
    }

    /**
     * Supplies HTML fragments that each contain at least one link to
     * {@link #TEST_LINK}.
     */
    @DataProvider
    public Object[][] HTMLContentProvider() {
      return new Object[][] {
        // lower-case and upper-case attribute name, single quotes
        new Object[] { "abc hahaha <a href='" + TEST_LINK + "'>google</a>" },
        new Object[] { "abc hahaha <a HREF='" + TEST_LINK + "'>google</a>" },

        // two anchors in the same fragment
        new Object[] { "abc hahaha <A HREF='" + TEST_LINK + "'>google</A> , "
        + "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>" },

        // extra attribute after / before href
        new Object[] { "abc hahaha <A HREF='" + TEST_LINK + "' target='_blank'>google</A>" },
        new Object[] { "abc hahaha <A target='_blank' HREF='" + TEST_LINK + "'>google</A>" },
        // double-quoted value
        new Object[] { "abc hahaha <A target='_blank' HREF=\"" + TEST_LINK + "\">google</A>" },
        // unquoted value
        new Object[] { "abc hahaha <a HREF=" + TEST_LINK + ">google</a>" }, };
    }

    /**
     * Every fragment must yield at least one link, and every extracted link
     * must equal {@link #TEST_LINK} once quotes are stripped.
     */
    @Test(dataProvider = "HTMLContentProvider")
    public void ValidHTMLLinkTest(String html) {

        Vector<HtmlLink> links = htmlLinkExtractor.grabHTMLLinks(html);

        // at least one link must have been found
        Assert.assertFalse(links.isEmpty());

        for (HtmlLink htmlLink : links) {
            Assert.assertEquals(htmlLink.getLink(), TEST_LINK);
        }

    }
}

Résultat

[TestNG] Running:
  /private/var/folders/w8/jxyz5pf51lz7nmqm_hv5z5br0000gn/T/testng-eclipse--530204890/testng-customsuite.xml

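PASSED: ValidHTMLLinkTest("abc hahaha <a href='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com'>google</A> , abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF="http://www.google.com">google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF=http://www.google.com>google</a>")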