small enhancements in pdf parser

pull/1/head
Michael Peter Christen 13 years ago
parent c6ba44468e
commit eadb58dd87

@ -34,7 +34,6 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -127,14 +126,14 @@ public class pdfParser extends AbstractParser implements Parser {
docTitle = MultiProtocolURI.unescape(location.getFileName()); docTitle = MultiProtocolURI.unescape(location.getFileName());
} }
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = UTF8.getBytes(""); byte[] contentBytes = new byte[0];
try { try {
// create a writer for output // create a writer for output
final PDFTextStripper stripper = new PDFTextStripper(); final PDFTextStripper stripper = new PDFTextStripper();
stripper.setEndPage(3); // get first 3 pages (always) stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc)); writer.append(stripper.getText(pdfDoc));
contentBytes = UTF8.getBytes(writer.toString()); // remember text in case of interrupting thread contentBytes = writer.getBytes(); // remember text in case of interrupting thread
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default stripper.setEndPage(Integer.MAX_VALUE); // set to default
@ -151,7 +150,7 @@ public class pdfParser extends AbstractParser implements Parser {
t.join(3000); t.join(3000);
if (t.isAlive()) t.interrupt(); if (t.isAlive()) t.interrupt();
pdfDoc.close(); pdfDoc.close();
contentBytes = UTF8.getBytes(writer.toString()); // get final text before closing writer contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); writer.close();
} catch (final IOException e) { } catch (final IOException e) {
// close the writer // close the writer

Loading…
Cancel
Save