Tuesday, November 10, 2009

Using Jericho HTML Parser to convert relative paths to absolute paths

Here is some sample Java code that converts the 'src' attribute of every element from a relative path to an absolute path. It is easy to extend this to 'href' attributes as well.

package au.edu.apf.phenomebank.web;

import java.io.IOException;
import java.net.URL;
import java.util.List;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.MicrosoftTagTypes;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Source;

/**
 * Handles a Foreign HTML page
 * @author Philip Wu
 */
public class ForeignHtmlParser {

    public static String convertRelativeToAbsolutePaths(String absoluteBasePath, String urlString) throws IOException {
        Source source=new Source(new URL(urlString));
        OutputDocument outDoc = new OutputDocument(source);
        List<Element> elementList=source.getAllElements();
        for (Element element : elementList) {
            if (element.getAttributes()!=null) { 
                Attributes attributes = element.getAttributes();
                for (Attribute att  : attributes) {
                    if (att.getName().toLowerCase().equals("src")) {
                        // Convert relative paths to absolute paths
                        if (! att.getValue().startsWith("http")) {
                            // Build the Absolute path 
                            StringBuilder sb = new StringBuilder();
                            if (! att.getValue().startsWith("/"))
                            // Replace the attribute with the new one
                            outDoc.replace(att, "src='"+sb.toString()+"'");                                                         

        return outDoc.toString();

No comments:

Post a Comment