001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import static org.apache.commons.io.IOUtils.EOF;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.Reader;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.Charset;
027import java.nio.charset.CharsetEncoder;
028import java.nio.charset.CoderResult;
029import java.nio.charset.CodingErrorAction;
030import java.util.Objects;
031
032import org.apache.commons.io.Charsets;
033import org.apache.commons.io.IOUtils;
034import org.apache.commons.io.build.AbstractOrigin;
035import org.apache.commons.io.build.AbstractStreamBuilder;
036import org.apache.commons.io.charset.CharsetEncoders;
037
038/**
039 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
040 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
041 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
042 * <p>
043 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
044 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
045 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
046 * {@link java.io.BufferedReader}.
047 * </p>
048 * <p>
 * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the example below, reading from {@code in2}
 * would return the same byte sequence as reading from {@code inputStream} (provided that the initial byte sequence is legal with respect to the charset
 * encoding):
051 * </p>
052 * <p>
053 * To build an instance, see {@link Builder}.
054 * </p>
055 * <pre>
056 * InputStream inputStream = ...
057 * Charset cs = ...
058 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
059 * ReaderInputStream in2 = ReaderInputStream.builder()
060 *   .setReader(reader)
061 *   .setCharset(cs)
062 *   .get();
063 * </pre>
064 * <p>
065 * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes
066 * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
067 * pulls it from the underlying stream.
068 * </p>
069 * <p>
070 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
071 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
072 * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
073 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
074 * </p>
075 * <p>
076 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
077 * </p>
078 * <p>
079 * Instances of {@link ReaderInputStream} are not thread safe.
080 * </p>
081 *
082 * @see org.apache.commons.io.output.WriterOutputStream
083 * @since 2.0
084 */
085public class ReaderInputStream extends InputStream {
086
087    /**
088     * Builds a new {@link ReaderInputStream} instance.
089     * <p>
090     * For example:
091     * </p>
092     * <pre>{@code
093     * ReaderInputStream s = ReaderInputStream.builder()
094     *   .setPath(path)
095     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
096     *   .get();}
097     * </pre>
098     *
099     * @since 2.12.0
100     */
101    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
102
103        private CharsetEncoder charsetEncoder = newEncoder(getCharset());
104
105        /**
106         * Constructs a new instance.
107         * <p>
108         * This builder use the aspects Reader, Charset, CharsetEncoder, buffer size.
109         * </p>
110         * <p>
111         * You must provide an origin that can be converted to a Reader by this builder, otherwise, this call will throw an
112         * {@link UnsupportedOperationException}.
113         * </p>
114         *
115         * @return a new instance.
116         * @throws UnsupportedOperationException if the origin cannot provide a Reader.
117         * @throws IllegalStateException if the {@code origin} is {@code null}.
118         * @see AbstractOrigin#getReader(Charset)
119         */
120        @SuppressWarnings("resource")
121        @Override
122        public ReaderInputStream get() throws IOException {
123            return new ReaderInputStream(checkOrigin().getReader(getCharset()), charsetEncoder, getBufferSize());
124        }
125
126        CharsetEncoder getCharsetEncoder() {
127            return charsetEncoder;
128        }
129
130        @Override
131        public Builder setCharset(final Charset charset) {
132            super.setCharset(charset);
133            charsetEncoder = newEncoder(getCharset());
134            return this;
135        }
136
137        /**
138         * Sets the charset encoder. Assumes that the caller has configured the encoder.
139         *
140         * @param newEncoder the charset encoder, null resets to a default encoder.
141         * @return this
142         */
143        public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
144            charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
145            super.setCharset(charsetEncoder.charset());
146            return this;
147        }
148
149    }
150
151    /**
152     * Constructs a new {@link Builder}.
153     *
154     * @return a new {@link Builder}.
155     * @since 2.12.0
156     */
157    public static Builder builder() {
158        return new Builder();
159    }
160
161    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
162        final float minRequired = minBufferSize(charsetEncoder);
163        if (bufferSize < minRequired) {
164            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
165                    charsetEncoder.charset().displayName()));
166        }
167        return bufferSize;
168    }
169
170    static float minBufferSize(final CharsetEncoder charsetEncoder) {
171        return charsetEncoder.maxBytesPerChar() * 2;
172    }
173
174    private static CharsetEncoder newEncoder(final Charset charset) {
175        // @formatter:off
176        return Charsets.toCharset(charset).newEncoder()
177                .onMalformedInput(CodingErrorAction.REPLACE)
178                .onUnmappableCharacter(CodingErrorAction.REPLACE);
179        // @formatter:on
180    }
181
182    private final Reader reader;
183
184    private final CharsetEncoder charsetEncoder;
185
186    /**
187     * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
188     */
189    private final CharBuffer encoderIn;
190    /**
191     * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
192     * caller.
193     */
194    private final ByteBuffer encoderOut;
195
196    private CoderResult lastCoderResult;
197
198    private boolean endOfInput;
199
200    /**
201     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of
202     * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
203     *
204     * @param reader the target {@link Reader}
205     * @deprecated Use {@link ReaderInputStream#builder()} instead
206     */
207    @Deprecated
208    public ReaderInputStream(final Reader reader) {
209        this(reader, Charset.defaultCharset());
210    }
211
212    /**
213     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
214     *
215     * <p>
216     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
217     * </p>
218     *
219     * @param reader  the target {@link Reader}
220     * @param charset the charset encoding
221     * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
222     */
223    @Deprecated
224    public ReaderInputStream(final Reader reader, final Charset charset) {
225        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
226    }
227
228    /**
229     * Constructs a new {@link ReaderInputStream}.
230     *
231     * <p>
232     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
233     * </p>
234     *
235     * @param reader     the target {@link Reader}.
236     * @param charset    the charset encoding.
237     * @param bufferSize the size of the input buffer in number of characters.
238     * @deprecated Use {@link ReaderInputStream#builder()} instead
239     */
240    @Deprecated
241    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
242        // @formatter:off
243        this(reader,
244            Charsets.toCharset(charset).newEncoder()
245                    .onMalformedInput(CodingErrorAction.REPLACE)
246                    .onUnmappableCharacter(CodingErrorAction.REPLACE),
247             bufferSize);
248        // @formatter:on
249    }
250
251    /**
252     * Constructs a new {@link ReaderInputStream}.
253     *
254     * <p>
255     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
256     * an encoder which had already been in use.
257     * </p>
258     *
259     * @param reader         the target {@link Reader}
260     * @param charsetEncoder the charset encoder
261     * @since 2.1
262     * @deprecated Use {@link ReaderInputStream#builder()} instead
263     */
264    @Deprecated
265    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
266        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
267    }
268
269    /**
270     * Constructs a new {@link ReaderInputStream}.
271     *
272     * <p>
273     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
274     * an encoder which had already been in use.
275     * </p>
276     *
277     * @param reader         the target {@link Reader}
278     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
279     * @param bufferSize     the size of the input buffer in number of characters
280     * @since 2.1
281     * @deprecated Use {@link ReaderInputStream#builder()} instead
282     */
283    @Deprecated
284    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
285        this.reader = reader;
286        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
287        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
288        this.encoderIn.flip();
289        this.encoderOut = ByteBuffer.allocate(128);
290        this.encoderOut.flip();
291    }
292
293    /**
294     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
295     *
296     * <p>
297     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
298     * </p>
299     *
300     * @param reader      the target {@link Reader}
301     * @param charsetName the name of the charset encoding
302     * @deprecated Use {@link ReaderInputStream#builder()} instead
303     */
304    @Deprecated
305    public ReaderInputStream(final Reader reader, final String charsetName) {
306        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
307    }
308
309    /**
310     * Constructs a new {@link ReaderInputStream}.
311     *
312     * <p>
313     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
314     * </p>
315     *
316     * @param reader      the target {@link Reader}
317     * @param charsetName the name of the charset encoding, null maps to the default Charset.
318     * @param bufferSize  the size of the input buffer in number of characters
319     * @deprecated Use {@link ReaderInputStream#builder()} instead
320     */
321    @Deprecated
322    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
323        this(reader, Charsets.toCharset(charsetName), bufferSize);
324    }
325
326    /**
327     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
328     *
329     * @throws IOException if an I/O error occurs.
330     */
331    @Override
332    public void close() throws IOException {
333        reader.close();
334    }
335
336    /**
337     * Fills the internal char buffer from the reader.
338     *
339     * @throws IOException If an I/O error occurs
340     */
341    private void fillBuffer() throws IOException {
342        if (endOfInput) {
343            return;
344        }
345        if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
346            encoderIn.compact();
347            final int position = encoderIn.position();
348            // We don't use Reader#read(CharBuffer) here because it is more efficient
349            // to write directly to the underlying char array (the default implementation
350            // copies data to a temporary char array).
351            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
352            if (c == EOF) {
353                endOfInput = true;
354            } else {
355                encoderIn.position(position + c);
356            }
357            encoderIn.flip();
358        }
359        encoderOut.compact();
360        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
361        if (endOfInput) {
362            lastCoderResult = charsetEncoder.flush(encoderOut);
363        }
364        if (lastCoderResult.isError()) {
365            lastCoderResult.throwException();
366        }
367        encoderOut.flip();
368    }
369
370    /**
371     * Gets the CharsetEncoder.
372     *
373     * @return the CharsetEncoder.
374     */
375    CharsetEncoder getCharsetEncoder() {
376        return charsetEncoder;
377    }
378
379    /**
380     * Reads a single byte.
381     *
382     * @return either the byte read or {@code -1} if the end of the stream has been reached
383     * @throws IOException if an I/O error occurs.
384     */
385    @Override
386    public int read() throws IOException {
387        for (;;) {
388            if (encoderOut.hasRemaining()) {
389                return encoderOut.get() & 0xFF;
390            }
391            fillBuffer();
392            if (endOfInput && !encoderOut.hasRemaining()) {
393                return EOF;
394            }
395        }
396    }
397
398    /**
399     * Reads the specified number of bytes into an array.
400     *
401     * @param b the byte array to read into
402     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
403     * @throws IOException if an I/O error occurs.
404     */
405    @Override
406    public int read(final byte[] b) throws IOException {
407        return read(b, 0, b.length);
408    }
409
410    /**
411     * Reads the specified number of bytes into an array.
412     *
413     * @param array the byte array to read into
414     * @param off   the offset to start reading bytes into
415     * @param len   the number of bytes to read
416     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
417     * @throws IOException if an I/O error occurs.
418     */
419    @Override
420    public int read(final byte[] array, int off, int len) throws IOException {
421        Objects.requireNonNull(array, "array");
422        if (len < 0 || off < 0 || off + len > array.length) {
423            throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
424        }
425        int read = 0;
426        if (len == 0) {
427            return 0; // Always return 0 if len == 0
428        }
429        while (len > 0) {
430            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
431                final int c = Math.min(encoderOut.remaining(), len);
432                encoderOut.get(array, off, c);
433                off += c;
434                len -= c;
435                read += c;
436            } else if (endOfInput) { // Already reach EOF in the last read
437                break;
438            } else { // Read again
439                fillBuffer();
440            }
441        }
442        return read == 0 && endOfInput ? EOF : read;
443    }
444}