package au.edu.apf.phenomebank.web;
import java.io.IOException;
import java.net.URL;
import java.util.List;
import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.MicrosoftTagTypes;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Source;
/**
 * Rewrites the markup of a foreign (externally hosted) HTML page.
 *
 * @author Philip Wu
 */
public class ForeignHtmlParser {

    /**
     * Fetches the page at {@code urlString} and returns its markup with every relative
     * {@code src} attribute prefixed by {@code absoluteBasePath}.
     *
     * <p>A {@code src} value is left untouched when it has no value at all, when it is
     * protocol-relative ({@code //host/...}), or when it already starts with a URI scheme
     * ({@code http:}, {@code https:}, {@code data:}, ...). Every other value is treated as
     * relative and simply appended to the base path with exactly one {@code /} between the
     * two parts; {@code ..} segments are not resolved.
     *
     * @param absoluteBasePath prefix to put in front of relative {@code src} values; must not be null
     * @param urlString        location of the HTML page to load and rewrite
     * @return the page markup with the relative {@code src} attributes rewritten
     * @throws IOException          if {@code urlString} is malformed or the page cannot be read
     * @throws NullPointerException if {@code absoluteBasePath} is null
     */
    public static String convertRelativeToAbsolutePaths(String absoluteBasePath, String urlString) throws IOException {
        if (absoluteBasePath == null) {
            // Fail fast instead of silently emitting "null/..." into the page.
            throw new NullPointerException("absoluteBasePath");
        }

        MicrosoftTagTypes.register();
        Source source = new Source(new URL(urlString));
        OutputDocument outDoc = new OutputDocument(source);

        for (Element element : source.getAllElements()) {
            Attributes attributes = element.getAttributes();
            if (attributes == null) {
                continue;
            }
            for (Attribute att : attributes) {
                if (!"src".equalsIgnoreCase(att.getName())) {
                    continue;
                }
                // getValue() is null for an attribute written without a value (<img src>).
                String value = att.getValue();
                if (value == null || isAbsoluteReference(value)) {
                    continue;
                }
                String absolute = joinPath(absoluteBasePath, value);
                // Replace the whole attribute with the rewritten one.
                outDoc.replace(att, "src='" + escapeForSingleQuotedAttribute(absolute) + "'");
            }
        }
        return outDoc.toString();
    }

    /**
     * Tells whether {@code value} is protocol-relative or begins with a URI scheme, i.e. a
     * letter followed by letters, digits, {@code +}, {@code -} or {@code .} up to a colon.
     */
    private static boolean isAbsoluteReference(String value) {
        if (value.startsWith("//")) {
            return true;
        }
        int colon = value.indexOf(':');
        if (colon <= 0 || !isAsciiLetter(value.charAt(0))) {
            return false;
        }
        for (int i = 1; i < colon; i++) {
            char c = value.charAt(i);
            boolean schemeChar = isAsciiLetter(c) || (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
            if (!schemeChar) {
                // The colon belongs to a path or query (e.g. "img/a:b.png"), not to a scheme.
                return false;
            }
        }
        return true;
    }

    /** Tells whether {@code c} is one of {@code a-z} or {@code A-Z}. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Concatenates {@code base} and {@code relative} with exactly one {@code /} at the seam. */
    private static String joinPath(String base, String relative) {
        boolean baseEndsWithSlash = base.endsWith("/");
        boolean relativeStartsWithSlash = relative.startsWith("/");
        if (baseEndsWithSlash && relativeStartsWithSlash) {
            return base + relative.substring(1);
        }
        if (baseEndsWithSlash || relativeStartsWithSlash) {
            return base + relative;
        }
        return base + "/" + relative;
    }

    /**
     * Escapes {@code &} and {@code '} so that the (already decoded) value can be written
     * inside a single-quoted HTML attribute without terminating it early.
     */
    private static String escapeForSingleQuotedAttribute(String value) {
        return value.replace("&", "&amp;").replace("'", "&#39;");
    }
}
Tips and experience about developing websites with various technologies
Tuesday, November 10, 2009
Using Jericho HTML Parser to convert relative paths to absolute paths
Here is some sample Java code that converts the 'src' attribute of every element from a relative path to an absolute path. It is easy to extend this to 'href' attributes as well.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment