Tuesday, November 10, 2009

Using Jericho HTML Parser to convert relative paths to absolute paths

Here is some sample Java code that converts the value of every 'src' attribute in an HTML page from a relative path to an absolute path. It is easy to extend this to 'href' attributes as well.



package au.edu.apf.phenomebank.web;

import java.io.IOException;
import java.net.URL;
import java.util.List;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.MicrosoftTagTypes;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Source;

/**
 * Handles a foreign HTML page: rewrites relative {@code src} attribute values
 * so that they point at an absolute base path.
 *
 * @author Philip Wu
 */
public class ForeignHtmlParser {

    /**
     * Loads the HTML document at {@code urlString} and returns its markup with every
     * relative {@code src} attribute value prefixed by {@code absoluteBasePath}.
     *
     * <p>A {@code src} value is left untouched when it is missing or empty, when it
     * already carries a URI scheme (e.g. {@code http:}, {@code https:}, {@code data:}),
     * or when it is protocol-relative (starts with {@code //}). Rewritten attributes
     * are emitted as {@code src='...'} with {@code &} and {@code '} escaped.
     *
     * @param absoluteBasePath prefix to put in front of relative paths; trailing
     *                         slashes are ignored so the result never contains a
     *                         doubled separator at the join
     * @param urlString        location of the HTML document to load
     * @return the document markup with the relative {@code src} values rewritten
     * @throws IOException if the URL is malformed or the document cannot be read
     */
    public static String convertRelativeToAbsolutePaths(String absoluteBasePath, String urlString) throws IOException {

        MicrosoftTagTypes.register();

        Source source = new Source(new URL(urlString));
        OutputDocument outDoc = new OutputDocument(source);

        List<Element> elementList = source.getAllElements();
        for (Element element : elementList) {
            Attributes attributes = element.getAttributes();
            if (attributes == null) {
                continue;
            }

            for (Attribute att : attributes) {
                if (!"src".equalsIgnoreCase(att.getName())) {
                    continue;
                }

                // A valueless attribute (e.g. <img src>) has no value to rewrite.
                String value = att.getValue();
                if (value == null || value.length() == 0 || isAbsoluteReference(value)) {
                    continue;
                }

                // Replace the whole attribute with one carrying the absolute path.
                String absolutePath = joinPath(absoluteBasePath, value);
                outDoc.replace(att, "src='" + escapeForSingleQuotedAttribute(absolutePath) + "'");
            }
        }

        return outDoc.toString();
    }

    /**
     * Tells whether a reference must not be prefixed with the base path: either it is
     * protocol-relative ({@code //host/...}) or it starts with a URI scheme, i.e. an
     * ASCII letter followed by ASCII letters, digits, {@code +}, {@code -} or {@code .}
     * and then a colon.
     */
    private static boolean isAbsoluteReference(String value) {
        if (value.startsWith("//")) {
            return true;
        }
        int colon = value.indexOf(':');
        if (colon <= 0 || !isAsciiLetter(value.charAt(0))) {
            return false;
        }
        for (int i = 1; i < colon; i++) {
            char c = value.charAt(i);
            boolean schemeChar = isAsciiLetter(c) || (c >= '0' && c <= '9')
                    || c == '+' || c == '-' || c == '.';
            if (!schemeChar) {
                return false;
            }
        }
        return true;
    }

    /** Returns true for {@code a-z} and {@code A-Z} only. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Joins base and relative path with exactly one {@code /} between them. */
    private static String joinPath(String base, String relative) {
        StringBuilder sb = new StringBuilder();
        sb.append(base);

        // Drop trailing slashes on the base so "http://host/" + "/a.png" does not double up.
        int end = sb.length();
        while (end > 0 && sb.charAt(end - 1) == '/') {
            end--;
        }
        sb.setLength(end);

        if (!relative.startsWith("/")) {
            sb.append('/');
        }
        sb.append(relative);
        return sb.toString();
    }

    /** Escapes the characters that would break a single-quoted HTML attribute value. */
    private static String escapeForSingleQuotedAttribute(String value) {
        StringBuilder sb = new StringBuilder(value.length() + 16);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c == '&') {
                sb.append("&amp;");
            } else if (c == '\'') {
                sb.append("&#39;");
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

}

No comments:

Post a Comment