diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java
index b55ac845a..281f5c813 100644
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@@ -20,6 +20,18 @@
package net.yacy.cora.util;
+import java.awt.Container;
+import java.awt.Dimension;
+import java.awt.Graphics;
+import java.awt.Image;
+import java.awt.MediaTracker;
+import java.awt.image.BufferedImage;
+import java.beans.PropertyChangeEvent;
+import java.beans.PropertyChangeListener;
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
import javax.imageio.ImageIO;
import javax.swing.JEditorPane;
import javax.swing.text.Document;
@@ -34,18 +46,13 @@ import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
-import java.awt.Container;
-import java.awt.Dimension;
-import java.awt.Graphics;
-import java.awt.Image;
-import java.awt.MediaTracker;
-import java.awt.image.BufferedImage;
-import java.beans.PropertyChangeEvent;
-import java.beans.PropertyChangeListener;
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+/**
+ * Converts html to an on-disk copy in another file format
+ * (currently pdf and/or jpg).
+ */
public class Html2Image {
// Mac
@@ -132,18 +139,32 @@ public class Html2Image {
}
/**
- * convert a pdf to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
- * @param pdf
- * @param image
+ * Convert a pdf (first page) to an image. Proper values are e.g. width = 1024, height = 1024, density = 300, quality = 75.
+ * Uses the internal pdf library, or an external command line tool on linux or mac.
+ * @param pdf input pdf file
+ * @param image output jpg file
* @param width
* @param height
- * @param density
+ * @param density (dpi)
* @param quality
* @return
*/
public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian;
-
+
+ // convert pdf to jpg using internal pdfbox capability
+ if (OS.isWindows || !convert.exists()) {
+ try {
+ PDDocument pdoc = PDDocument.load(pdf);
+ PDPage page = (PDPage) pdoc.getDocumentCatalog().getAllPages().get(0);
+ BufferedImage bi = page.convertToImage(BufferedImage.TYPE_INT_RGB, density);
+
+ return ImageIO.write(bi, "jpg", image);
+
+ } catch (IOException ex) { }
+ }
+
+ // convert on mac or linux using external command line utility
try {
// i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
// note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 9291bdb25..bb8ef3a3b 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -41,6 +41,7 @@ import java.util.HashSet;
import java.util.List;
import org.apache.pdfbox.exceptions.CryptographyException;
+import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
@@ -65,7 +66,6 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
-import org.apache.pdfbox.pdfparser.PDFParser;
public class pdfParser extends AbstractParser implements Parser {
@@ -204,7 +204,7 @@ public class pdfParser extends AbstractParser implements Parser {
docPublisher,
null,
null,
- 0.0f, 0.0f,
+ 0.0d, 0.0d,
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
null,
diff --git a/test/java/net/yacy/cora/util/Html2ImageTest.java b/test/java/net/yacy/cora/util/Html2ImageTest.java
new file mode 100644
index 000000000..dfb010b91
--- /dev/null
+++ b/test/java/net/yacy/cora/util/Html2ImageTest.java
@@ -0,0 +1,37 @@
+package net.yacy.cora.util;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.ArrayList;
+import java.util.List;
+import net.yacy.utils.translation.ExtensionsFileFilter;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+
+public class Html2ImageTest {
+
+    /**
+     * Converts the first page of every pdf in test/parsertest to a jpg in test/DATA
+     * and checks that each output file was actually written.
+     */
+    @Test
+    public void testPdf2image() {
+        // collect pdf filenames in test directory
+        File pd = new File("test/parsertest");
+        List<String> extensions = new ArrayList<String>();
+        extensions.add("pdf");
+        FilenameFilter fileFilter = new ExtensionsFileFilter(extensions);
+        String[] pdffiles = pd.list(fileFilter);
+        // File.list returns null if the directory is missing or unreadable
+        assertNotNull("cannot list " + pd.getAbsolutePath(), pdffiles);
+        for (String pdffilename : pdffiles) {
+            File pdffile = new File(pd, pdffilename);
+            File jpgfile = new File("test/DATA", pdffilename + ".jpg");
+            // remove a stale result so exists() below proves this run wrote it
+            assertTrue("cannot delete " + jpgfile, !jpgfile.exists() || jpgfile.delete());
+            Html2Image.pdf2image(pdffile, jpgfile, 1024, 1024, 300, 75);
+            assertTrue("no image created for " + pdffilename, jpgfile.exists());
+        }
+    }
+}