package htmltopdf; import com.lowagie.text.DocumentException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URL; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.CommentNode; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.HtmlNode; import org.htmlcleaner.PrettyXmlSerializer; import org.htmlcleaner.TagNode; import org.htmlcleaner.TagNodeVisitor; import org.htmlcleaner.Utils; import org.xhtmlrenderer.pdf.ITextRenderer; public class HTMLtoPDF { static int cssCounter = 0; public static void main(String[] args) { try { final String site = "http://www.keysurvey.co.za"; final String page = "/company/"; final String cssUrl = "http://www.keysurvey.co.za"; URL url = new URL(site+page); CleanerProperties props = new CleanerProperties(); // HTMLCleaner part // set some properties to non-default values props.setTranslateSpecialEntities(true); props.setTransResCharsToNCR(true); props.setOmitComments(true); // do parsing TagNode tagNode = new HtmlCleaner(props).clean(url); tagNode.traverse(new TagNodeVisitor() { public boolean visit(TagNode tagNode, HtmlNode htmlNode) { if (htmlNode instanceof TagNode) { TagNode tag = (TagNode) htmlNode; String tagName = tag.getName(); if ("img".equals(tagName)) { String src = tag.getAttributeByName("src"); if (src != null && ! 
src.startsWith("http")) { tag.setAttribute("src", Utils.fullUrl(site, src)); } } if ("link".equals(tagName)) { String rel = tag.getAttributeByName("rel"); String href = tag.getAttributeByName("href"); if (href != null && "stylesheet".equals(rel)) { try { HttpClient client = new DefaultHttpClient(); String fullUrl = ""; if (href.startsWith("http")) fullUrl = href; else fullUrl = Utils.fullUrl(cssUrl, href); HttpGet get = new HttpGet(fullUrl); HttpResponse response = client.execute(get); HttpEntity entity = response.getEntity(); if (entity != null) { InputStream is = entity.getContent(); href = "css" + cssCounter + ".css"; cssCounter++; OutputStream os = new FileOutputStream(href); IOUtils.copy(is, os); } tag.setAttribute("href", href); } catch (IOException ex) { Logger.getLogger(HTMLtoPDF.class.getName()).log(Level.SEVERE, null, ex); } } } } else if (htmlNode instanceof CommentNode) { CommentNode comment = ((CommentNode) htmlNode); comment.getContent().append(" -- By HtmlCleaner"); } // tells visitor to continue traversing the DOM tree return true; } }); // serialize to xml file new PrettyXmlSerializer(props).writeToFile( tagNode, "page.xhtml", "utf-8"); // FlyingSaucer and iText part String inputFile = "page.xhtml"; String url2 = new File(inputFile).toURI().toURL().toString(); String outputFile = "firstdoc.pdf"; OutputStream os = new FileOutputStream(outputFile); ITextRenderer renderer = new ITextRenderer(); renderer.setDocument(url2); renderer.layout(); renderer.createPDF(os); os.close(); } catch (DocumentException ex) { Logger.getLogger(HTMLtoPDF.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(HTMLtoPDF.class.getName()).log(Level.SEVERE, null, ex); } } }
FlyingSaucer cannot read the CSS files over the web, so we have to save them locally. While parsing the HTML source of the page, we need to save a local copy of the CSS file for every link tag with rel='stylesheet'. Relative links to image files also need to be replaced with absolute URLs.
To compile and run the above example you will need the JARs supplied with the FlyingSaucer distribution (it includes XHTMLRenderer and iText), Apache HttpClient, Apache Commons IO (which provides IOUtils) and HtmlCleaner.
Overall the code works fairly well on a variety of pages, even poorly formatted ones. There are a few issues that I have noted that may prevent some pages from being rendered to PDF correctly. Here are some of them:
1. Some types of comments (e.g. .ClassName /* some comment */ { ... }) are not supported by the CSS parser of FlyingSaucer. It simply stops parsing the CSS without throwing any exceptions.
2. @import and url() properties of the CSS are not supported.
But overall, for many pages the described solution should be good enough, especially when you need to convert internal pages that you can properly prepare for the PDF conversion.
6 comments:
My gratitude, for taking out time to post useful code snippet!
It helped me a lot.
Hi ,
Nice article,..
But for a few pages I am getting this error. Can you please advise me how to overcome it?
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/http/params/SyncBasicHttpParams
at org.apache.http.impl.client.DefaultHttpClient.createHttpParams(DefaultHttpClient.java:161)
at org.apache.http.impl.client.AbstractHttpClient.getParams(AbstractHttpClient.java:426)
at org.apache.http.impl.client.AbstractHttpClient.createClientConnectionManager(AbstractHttpClient.java:297)
at org.apache.http.impl.client.AbstractHttpClient.getConnectionManager(AbstractHttpClient.java:445)
at org.apache.http.impl.client.AbstractHttpClient.createHttpContext(AbstractHttpClient.java:274)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:797)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:754)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:732)
at HTMLtoPDF$1.visit(HTMLtoPDF.java:75)
at org.htmlcleaner.TagNode.traverseInternally(TagNode.java:648)
at org.htmlcleaner.TagNode.traverseInternally(TagNode.java:657)
at org.htmlcleaner.TagNode.traverseInternally(TagNode.java:657)
at org.htmlcleaner.TagNode.traverse(TagNode.java:641)
at HTMLtoPDF.main(HTMLtoPDF.java:53)
Thank you in advance
There is a Java library available for PDF known as Aspose.PDF for Java that can not only convert HTML to PDF and vice versa but can also create, open, edit and convert PDF to other formats.
There is this one awesome JPG to PDF converter that is available. This JPG to PDF converter from PDFOnline is compatible with multiple formats and delivers power packed performance.
Thank you.
Post a Comment