Package org.jsoup.parser
Class CharacterReader
- java.lang.Object
-
- org.jsoup.parser.CharacterReader
-
public final class CharacterReader extends java.lang.Object
CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
-
-
Field Summary
Fields
Modifier and Type | Field | Description
private int
bufLength
private int
bufMark
private int
bufPos
private int
bufSplitPoint
private char[]
charBuf
(package private) static char
EOF
private int
lastIcIndex
private java.lang.String
lastIcSeq
private int
lineNumberOffset
(package private) static int
maxBufferLen
private static int
maxStringCacheLen
private static int
minReadAheadLen
private java.util.ArrayList<java.lang.Integer>
newlinePositions
(package private) static int
readAheadLimit
private java.io.Reader
reader
private int
readerPos
private boolean
readFully
private java.lang.String[]
stringCache
private static int
stringCacheSize
-
Constructor Summary
Constructors
Constructor | Description
CharacterReader(java.io.Reader input)
CharacterReader(java.io.Reader input, int sz)
CharacterReader(java.lang.String input)
-
Method Summary
All Methods | Static Methods | Instance Methods | Concrete Methods
Modifier and Type | Method | Description
void
advance()
Moves the current position by one.
private void
bufferUp()
private static java.lang.String
cacheString(char[] charBuf, java.lang.String[] stringCache, int start, int count)
Caches short strings, as a flyweight pattern, to reduce GC load.
void
close()
int
columnNumber()
Get the current column number (that the reader has consumed to).
(package private) int
columnNumber(int pos)
(package private) char
consume()
(package private) java.lang.String
consumeAttributeQuoted(boolean single)
(package private) java.lang.String
consumeData()
(package private) java.lang.String
consumeDigitSequence()
(package private) java.lang.String
consumeHexSequence()
(package private) java.lang.String
consumeLetterSequence()
(package private) java.lang.String
consumeLetterThenDigitSequence()
(package private) java.lang.String
consumeRawData()
(package private) java.lang.String
consumeTagName()
java.lang.String
consumeTo(char c)
Reads characters up to the specific char.
(package private) java.lang.String
consumeTo(java.lang.String seq)
java.lang.String
consumeToAny(char... chars)
Read characters until the first of any delimiters is found.
(package private) java.lang.String
consumeToAnySorted(char... chars)
(package private) java.lang.String
consumeToEnd()
(package private) boolean
containsIgnoreCase(java.lang.String seq)
Used to check presence of </title>, </textarea> when we're in RCData and see a <xxx.
char
current()
Get the char at the current position.
boolean
isEmpty()
Tests if all the content has been read.
private boolean
isEmptyNoBufferUp()
boolean
isTrackNewlines()
Check if the tracking of newlines is enabled.
int
lineNumber()
Get the current line number (that the reader has consumed to).
(package private) int
lineNumber(int pos)
private int
lineNumIndex(int pos)
(package private) void
mark()
(package private) boolean
matchConsume(java.lang.String seq)
(package private) boolean
matchConsumeIgnoreCase(java.lang.String seq)
(package private) boolean
matches(char c)
(package private) boolean
matches(java.lang.String seq)
(package private) boolean
matchesAny(char... seq)
(package private) boolean
matchesAnySorted(char[] seq)
(package private) boolean
matchesAsciiAlpha()
Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
(package private) boolean
matchesDigit()
(package private) boolean
matchesIgnoreCase(java.lang.String seq)
(package private) boolean
matchesLetter()
(package private) int
nextIndexOf(char c)
Returns the number of characters between the current position and the next instance of the input char.
(package private) int
nextIndexOf(java.lang.CharSequence seq)
Returns the number of characters between the current position and the next instance of the input sequence.
int
pos()
Gets the position currently read to in the content.
(package private) java.lang.String
posLineCol()
Get a formatted string representing the current line and column positions.
(package private) static boolean
rangeEquals(char[] charBuf, int start, int count, java.lang.String cached)
Check if the value of the provided range equals the string.
(package private) boolean
rangeEquals(int start, int count, java.lang.String cached)
(package private) boolean
readFully()
Tests if the buffer has been fully read.
(package private) void
rewindToMark()
private void
scanBufferForNewlines()
Scans the buffer for newline position, and tracks their location in newlinePositions.
java.lang.String
toString()
void
trackNewlines(boolean track)
Enables or disables line number tracking.
(package private) void
unconsume()
Unconsume one character (bufPos--).
(package private) void
unmark()
-
-
-
Field Detail
-
EOF
static final char EOF
- See Also:
- Constant Field Values
-
maxStringCacheLen
private static final int maxStringCacheLen
- See Also:
- Constant Field Values
-
maxBufferLen
static final int maxBufferLen
- See Also:
- Constant Field Values
-
readAheadLimit
static final int readAheadLimit
- See Also:
- Constant Field Values
-
minReadAheadLen
private static final int minReadAheadLen
- See Also:
- Constant Field Values
-
charBuf
private char[] charBuf
-
reader
private java.io.Reader reader
-
bufLength
private int bufLength
-
bufSplitPoint
private int bufSplitPoint
-
bufPos
private int bufPos
-
readerPos
private int readerPos
-
bufMark
private int bufMark
-
stringCacheSize
private static final int stringCacheSize
- See Also:
- Constant Field Values
-
stringCache
private java.lang.String[] stringCache
-
newlinePositions
private java.util.ArrayList<java.lang.Integer> newlinePositions
-
lineNumberOffset
private int lineNumberOffset
-
readFully
private boolean readFully
-
lastIcSeq
private java.lang.String lastIcSeq
-
lastIcIndex
private int lastIcIndex
-
-
Method Detail
-
close
public void close()
-
bufferUp
private void bufferUp()
-
pos
public int pos()
Gets the position currently read to in the content. Starts at 0.
- Returns:
- current position
-
readFully
boolean readFully()
Tests if the buffer has been fully read.
-
trackNewlines
public void trackNewlines(boolean track)
Enables or disables line number tracking. By default, will be off. Tracking line numbers improves the legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of use.
- Parameters:
track
- set tracking on|off
- Since:
- 1.14.3
-
isTrackNewlines
public boolean isTrackNewlines()
Check if the tracking of newlines is enabled.
- Returns:
- the current newline tracking state
- Since:
- 1.14.3
-
lineNumber
public int lineNumber()
Get the current line number (that the reader has consumed to). Starts at line #1.
- Returns:
- the current line number, or 1 if line tracking is not enabled.
- Since:
- 1.14.3
- See Also:
trackNewlines(boolean)
-
lineNumber
int lineNumber(int pos)
-
columnNumber
public int columnNumber()
Get the current column number (that the reader has consumed to). Starts at column #1.
- Returns:
- the current column number
- Since:
- 1.14.3
- See Also:
trackNewlines(boolean)
-
columnNumber
int columnNumber(int pos)
-
posLineCol
java.lang.String posLineCol()
Get a formatted string representing the current line and column positions. E.g. 5:10 indicating line number 5 and column number 10.
- Returns:
- line:col position
- Since:
- 1.14.3
- See Also:
trackNewlines(boolean)
-
lineNumIndex
private int lineNumIndex(int pos)
-
scanBufferForNewlines
private void scanBufferForNewlines()
Scans the buffer for newline position, and tracks their location in newlinePositions.
-
isEmpty
public boolean isEmpty()
Tests if all the content has been read.
- Returns:
- true if nothing left to read.
-
isEmptyNoBufferUp
private boolean isEmptyNoBufferUp()
-
current
public char current()
Get the char at the current position.
- Returns:
- char
-
consume
char consume()
-
unconsume
void unconsume()
Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
-
advance
public void advance()
Moves the current position by one.
-
mark
void mark()
-
unmark
void unmark()
-
rewindToMark
void rewindToMark()
-
nextIndexOf
int nextIndexOf(char c)
Returns the number of characters between the current position and the next instance of the input char.
- Parameters:
c
- scan target
- Returns:
- offset between current position and next instance of target. -1 if not found.
-
nextIndexOf
int nextIndexOf(java.lang.CharSequence seq)
Returns the number of characters between the current position and the next instance of the input sequence.
- Parameters:
seq
- scan target
- Returns:
- offset between current position and next instance of target. -1 if not found.
-
consumeTo
public java.lang.String consumeTo(char c)
Reads characters up to the specific char.
- Parameters:
c
- the delimiter
- Returns:
- the chars read
-
consumeTo
java.lang.String consumeTo(java.lang.String seq)
-
consumeToAny
public java.lang.String consumeToAny(char... chars)
Read characters until the first of any delimiters is found.
- Parameters:
chars
- delimiters to scan for
- Returns:
- characters read up to the matched delimiter.
-
consumeToAnySorted
java.lang.String consumeToAnySorted(char... chars)
-
consumeData
java.lang.String consumeData()
-
consumeAttributeQuoted
java.lang.String consumeAttributeQuoted(boolean single)
-
consumeRawData
java.lang.String consumeRawData()
-
consumeTagName
java.lang.String consumeTagName()
-
consumeToEnd
java.lang.String consumeToEnd()
-
consumeLetterSequence
java.lang.String consumeLetterSequence()
-
consumeLetterThenDigitSequence
java.lang.String consumeLetterThenDigitSequence()
-
consumeHexSequence
java.lang.String consumeHexSequence()
-
consumeDigitSequence
java.lang.String consumeDigitSequence()
-
matches
boolean matches(char c)
-
matches
boolean matches(java.lang.String seq)
-
matchesIgnoreCase
boolean matchesIgnoreCase(java.lang.String seq)
-
matchesAny
boolean matchesAny(char... seq)
-
matchesAnySorted
boolean matchesAnySorted(char[] seq)
-
matchesLetter
boolean matchesLetter()
-
matchesAsciiAlpha
boolean matchesAsciiAlpha()
Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
- Returns:
- if it matches or not
-
matchesDigit
boolean matchesDigit()
-
matchConsume
boolean matchConsume(java.lang.String seq)
-
matchConsumeIgnoreCase
boolean matchConsumeIgnoreCase(java.lang.String seq)
-
containsIgnoreCase
boolean containsIgnoreCase(java.lang.String seq)
Used to check presence of </title>, </textarea> when we're in RCData and see a <xxx. Only finds consistent case.
-
toString
public java.lang.String toString()
- Overrides:
toString
in class java.lang.Object
-
cacheString
private static java.lang.String cacheString(char[] charBuf, java.lang.String[] stringCache, int start, int count)
Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks. Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. That saves both having to create objects as hash keys, and running through the entry list, at the expense of some more duplicates.
-
rangeEquals
static boolean rangeEquals(char[] charBuf, int start, int count, java.lang.String cached)
Check if the value of the provided range equals the string.
-
rangeEquals
boolean rangeEquals(int start, int count, java.lang.String cached)
-
-
-