Prevent unwanted cached bytes duplication on stream parsing.

pull/149/head
luccioman 8 years ago
parent ed678186a8
commit 8a94fef9e0

@ -271,24 +271,35 @@ public final class TextParser {
canStream = true; canStream = true;
} }
} }
} else if(sourceStream instanceof ByteArrayInputStream) {
/* Also check if we have a ByteArrayInputStream as source to prevent useless bytes duplication in a new byte array */
canStream = true;
} }
// if we do not have more than one non generic parser or the content size is over MaxInt (2GB) or is over the totally available memory // if we do not have more than one non generic parser, or the content size is over MaxInt (2GB), or is over the totally available memory,
// or stream is already in memory as a ByteArrayInputStream
// then we use only stream-oriented parser. // then we use only stream-oriented parser.
if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) { if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
try { try {
/* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource /* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
* and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */ * and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
int rewindSize = 10 * 1024; int rewindSize = 10 * 1024;
final BufferedInputStream bufferedStream = new BufferedInputStream(sourceStream, rewindSize); final InputStream markableStream;
if(sourceStream instanceof ByteArrayInputStream) {
/* No need to use a wrapping buffered stream when the source is already entirely in memory.
* What's more, ByteArrayInputStream has no read limit when marking.*/
markableStream = sourceStream;
} else {
markableStream = new BufferedInputStream(sourceStream, rewindSize);
}
/* Mark now to allow resetting the buffered stream to the beginning of the stream */ /* Mark now to allow resetting the buffered stream to the beginning of the stream */
bufferedStream.mark(rewindSize); markableStream.mark(rewindSize);
/* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */ /* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
for(Parser parser : idioms) { for(Parser parser : idioms) {
/* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream /* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
* and so let us eventually reuse the same opened stream with other parsers on parser failure */ * and so let us eventually reuse the same opened stream with other parsers on parser failure */
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(bufferedStream); CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try { try {
return parseSource(location, mimeType, parser, charset, scraper, timezoneOffset, return parseSource(location, mimeType, parser, charset, scraper, timezoneOffset,
@ -296,7 +307,7 @@ public final class TextParser {
} catch (Parser.Failure e) { } catch (Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes: /* Try to reset the marked stream. If the failed parser has consumed too many bytes:
* too bad, the mark is invalid and the process fails now with an IOException */ * too bad, the mark is invalid and the process fails now with an IOException */
bufferedStream.reset(); markableStream.reset();
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) { && (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
@ -309,7 +320,7 @@ public final class TextParser {
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/ * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
gzipParser gzParser = (gzipParser)parser; gzipParser gzParser = (gzipParser)parser;
nonCloseInputStream = new CloseShieldInputStream(bufferedStream); nonCloseInputStream = new CloseShieldInputStream(markableStream);
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser); Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
@ -323,7 +334,7 @@ public final class TextParser {
return new Document[] { maindoc }; return new Document[] { maindoc };
} catch(Exception e1) { } catch(Exception e1) {
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */ /* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
bufferedStream.reset(); markableStream.reset();
} }
} }
} }

Loading…
Cancel
Save