/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Maarten L. Hekkelman
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "zeem/parser.hpp"

#include "zeem/doctype.hpp"
#include "zeem/text.hpp"
#include "zeem/version.hpp"

#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <compare>
#include <cstddef>
#include <cstdint>
#include <format>
#include <map>
#include <memory>
#include <ranges>
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>

namespace zeem
{

/// \brief Format \a i as a lowercase hexadecimal string prefixed with "0x"
///
/// No leading zeros are written, but the value zero is rendered as "0x0".
/// \param i  the value to format
/// \return   the formatted string, e.g. "0xfffe"
std::string to_hex(uint32_t i)
{
	// room for "0x", two hex digits per byte and the terminating zero
	char s[sizeof(i) * 2 + 3];
	char *p = s + sizeof(s);
	*--p = 0;

	const char kHexChars[] = "0123456789abcdef";

	// the buffer is filled back to front; do/while guarantees at least
	// one digit so that zero does not end up as a bare "0x"
	do
	{
		*--p = kHexChars[i & 0x0F];
		i >>= 4;
	} while (i != 0);

	*--p = 'x';
	*--p = '0';

	return p;
}

/// \brief our own implementation of iequals: compares \a a with \a b case-insensitive
///
/// This is a limited use function, works only reliably with ASCII. But that's OK.
/// \return true when both views have the same length and every pair of
///         characters is equal after conversion with std::toupper
bool iequals(std::string_view a, std::string_view b)
{
	if (a.length() != b.length())
		return false;

	// std::toupper is only defined for values representable as unsigned char
	// (or EOF), so a possibly negative char must be converted first
	const auto upper = [](char c)
	{
		return std::toupper(static_cast<unsigned char>(c));
	};

	for (std::string_view::size_type i = 0; i < a.length(); ++i)
	{
		if (upper(a[i]) != upper(b[i]))
			return false;
	}

	return true;
}

/// \brief Decide whether \a s looks like an absolute path or location
///
/// A string is considered absolute when it starts with a '/' or when it
/// starts with a run of one or more letters that is directly followed by
/// a ':' (a drive letter or a scheme-like prefix).
/// \return true for an absolute path, false otherwise (including empty input)
bool is_absolute_path(std::string_view s)
{
	// std::isalpha is only defined for values representable as unsigned char
	// (or EOF), so a possibly negative char must be converted first
	const auto is_letter = [](char c)
	{
		return std::isalpha(static_cast<unsigned char>(c)) != 0;
	};

	if (s.empty())
		return false;

	if (s.front() == '/')
		return true;

	if (not is_letter(s.front()))
		return false;

	// skip the remaining letters, what follows must be the colon
	const auto ch = std::find_if_not(s.begin() + 1, s.end(), is_letter);
	return ch != s.end() and *ch == ':';
}

/// \brief Minimal check to see if \a url can be used as a namespace URL
///
/// The rules for url in namespaces are a bit different from the URI requirements in RFC3986.
/// All that is required here is a colon at an offset larger than one and a
/// letter as the very first character.
bool is_valid_url(std::string_view url)
{
	const auto cp = url.find(':');

	if (cp == std::string_view::npos or cp <= 1)
		return false;

	// std::isalpha is only defined for values representable as unsigned char
	// (or EOF), so a possibly negative char must be converted first
	return std::isalpha(static_cast<unsigned char>(url.front())) != 0;
}

// parsing XML is somewhat like macro processing,
// we can encounter entities that need to be expanded into replacement text
// and so we declare data_source objects that can be stacked.

// exception generated by data_source

/// \brief Exception thrown by a data_source when the raw input cannot be decoded
///
/// The message is handed to the base class and is also kept in m_wmsg,
/// where the code catching the exception can pick it up again.
class source_exception : public exception
{
  public:
	/// the message this exception was created with
	std::string m_wmsg;

	explicit source_exception(std::string msg)
		: exception(msg)
		, m_wmsg(std::move(msg))
	{
	}
};

// A data source can have a base dir which is the directory the data came from.
// This information is needed when a relative uri is found in an external ID.

/// \brief Abstract source of characters for the parser
///
/// Besides delivering characters a data_source keeps track of a base
/// directory, the encoding, the XML version, a line number and an id.
class data_source
{
  public:
	data_source()
	{
		// every source that is created receives the next number in sequence
		static int s_next_id = 0;
		m_id = s_next_id++;
	}

	// data sources cannot be copied
	data_source(const data_source &) = delete;
	data_source &operator=(const data_source &) = delete;

	virtual ~data_source() = default;

	/// \brief Deliver the next character, to be implemented by derived classes
	virtual char32_t get_next_char() = 0;

	// the base directory, initially "."
	[[nodiscard]] const std::string &base() const { return m_base; }
	void base(std::string dir) { m_base = std::move(dir); }

	// the encoding, initially UTF8. Derived classes can override the setter.
	[[nodiscard]] encoding_type encoding() const { return m_encoding; }
	virtual void encoding(encoding_type enc) { m_encoding = enc; }

	/// \brief This base implementation always reports that there is no byte order mark
	virtual bool has_bom() { return false; }

	// the XML version, initially 1.0
	void version(version_type v) { m_version = v; }

	// the id assigned at construction
	[[nodiscard]] int id() const { return m_id; }

	// the current line number
	void line_nr(int l) { m_line_nr = l; }
	[[nodiscard]] int line_nr() const { return m_line_nr; }

  protected:
	std::string m_base = ".";
	encoding_type m_encoding{ encoding_type::UTF8 };
	version_type m_version{ 1, 0 };
	int m_id;          // for nesting checks
	int m_line_nr = 0; // for reporting errors
};

// --------------------------------------------------------------------
// An std::istream implementation of data_source.

/// \brief A data_source that reads its bytes from a std::istream
///
/// A stream passed by reference stays the property of the caller, a stream
/// passed by pointer is deleted in the destructor of this class.
class istream_data_source : public data_source
{
  public:
	/// \brief Read from \a data, which is not deleted by this object
	explicit istream_data_source(std::istream &data)
		: m_data(&data)
		, m_owns_data(false)
	{
		guess_encoding();
	}

	/// \brief Read from \a data, which is deleted by the destructor
	explicit istream_data_source(std::istream *data)
		: m_data(data)
	{
		guess_encoding();
	}

	~istream_data_source() override
	{
		if (not m_owns_data)
			return;

		delete m_data;
	}

	bool has_bom() override { return m_has_bom; }

	char32_t get_next_char() override;
	void encoding(encoding_type enc) override;

  private:
	void guess_encoding();
	void parse_text_decl();

	// one decoder for each supported encoding
	char32_t next_utf8_char();
	char32_t next_utf16le_char();
	char32_t next_utf16be_char();
	char32_t next_iso88591_char();
	char32_t next_ascii_char();

	/// \brief Take one byte from the stream buffer, the end of the stream is reported as zero
	char8_t next_byte()
	{
		const int c = m_data->rdbuf()->sbumpc();
		return static_cast<char8_t>(c == std::streambuf::traits_type::eof() ? 0 : c);
	}

	std::istream *m_data;
	bool m_owns_data = true;
	char32_t m_char_buffer = 0; // used in detecting \r\n algorithm

	// the type of the decoder member functions
	using next_func = char32_t (istream_data_source::*)();

	next_func m_next{};     // the decoder that is currently in use
	bool m_has_bom = false; // set by guess_encoding
};

/// \brief Look for a byte order mark at the start of the stream
///
/// A UTF-16BE (FE FF), UTF-16LE (FF FE) or UTF-8 (EF BB BF) byte order mark
/// is consumed and recorded in m_encoding and m_has_bom. If there is no
/// byte order mark the bytes that were inspected are put back and the
/// encoding remains what it was (UTF-8 by default). At the end the
/// decoder for the resulting encoding is selected by calling encoding().
void istream_data_source::guess_encoding()
{
	// The values returned by the stream buffer are compared as int_type on
	// purpose: narrowing them to char would turn eof into 0xff and a stream
	// containing a single 0xfe byte would then seem to start with a BOM.
	std::streambuf *buf = m_data->rdbuf();

	switch (buf->sgetc())
	{
		case 0xfe:
			if (buf->snextc() == 0xff)
			{
				buf->snextc(); // skip the second byte of the BOM as well
				m_encoding = encoding_type::UTF16BE;
				m_has_bom = true;
			}
			else
				buf->sungetc();
			break;

		case 0xff:
			if (buf->snextc() == 0xfe)
			{
				buf->snextc(); // skip the second byte of the BOM as well
				m_encoding = encoding_type::UTF16LE;
				m_has_bom = true;
			}
			else
				buf->sungetc();
			break;

		case 0xef:
		{
			const auto ch2 = buf->snextc();
			const auto ch3 = buf->snextc();

			if (ch2 == 0xbb and ch3 == 0xbf)
			{
				buf->snextc(); // skip the last byte of the BOM as well
				m_encoding = encoding_type::UTF8;
				m_has_bom = true;
			}
			else
			{
				// not a BOM, step back to the first byte again
				buf->sungetc();
				buf->sputbackc(static_cast<char>(0xef));
			}
			break;
		}

		default:
			// end of stream or no BOM at all
			break;
	}

	encoding(m_encoding);
}

/// \brief Returns true when \a enc is ASCII, ISO88591 or UTF8, false for all other values
///
/// Note that UTF8 is part of this set although a single character may take
/// more than one byte in that encoding.
constexpr bool is_single_byte_encoding(encoding_type enc)
{
	switch (enc)
	{
		case encoding_type::ASCII:
		case encoding_type::ISO88591:
		case encoding_type::UTF8:
			return true;

		default:
			return false;
	}
}

/// \brief Switch to encoding \a enc and select the matching decoder
///
/// Changing to another encoding is only accepted when both the current and
/// the requested encoding pass is_single_byte_encoding.
/// \throws invalid_exception when the requested encoding cannot be combined
///         with the current one
void istream_data_source::encoding(encoding_type enc)
{
	const bool changed = enc != m_encoding;

	if (changed and not(is_single_byte_encoding(enc) and is_single_byte_encoding(m_encoding)))
		throw invalid_exception("Invalid encoding specified, incompatible with actual encoding");

	// stores enc in m_encoding
	data_source::encoding(enc);

	switch (enc)
	{
		case encoding_type::UTF8: m_next = &istream_data_source::next_utf8_char; break;
		case encoding_type::UTF16LE: m_next = &istream_data_source::next_utf16le_char; break;
		case encoding_type::UTF16BE: m_next = &istream_data_source::next_utf16be_char; break;
		case encoding_type::ISO88591: m_next = &istream_data_source::next_iso88591_char; break;
		case encoding_type::ASCII: m_next = &istream_data_source::next_ascii_char; break;

		// any other value leaves the current decoder in place
		default: break;
	}
}

/// \brief Decode the next UTF-8 encoded character from the stream
///
/// \return the decoded character, zero at the end of the stream
/// \throws source_exception when the first byte cannot start a UTF-8
///         sequence, when a continuation byte is malformed or when the
///         decoded four byte value is larger than 0x10ffff
char32_t istream_data_source::next_utf8_char()
{
	int result = next_byte();

	if (result & 0x080)
	{
		char8_t ch[3];

		if ((result & 0x0E0) == 0x0C0)
		{
			// two byte sequence
			ch[0] = next_byte();
			if ((ch[0] & 0x0c0) != 0x080)
				throw source_exception("Invalid utf-8");
			result = ((result & 0x01F) << 6) | (ch[0] & 0x03F);
		}
		else if ((result & 0x0F0) == 0x0E0)
		{
			// three byte sequence
			ch[0] = next_byte();
			ch[1] = next_byte();
			if ((ch[0] & 0x0c0) != 0x080 or (ch[1] & 0x0c0) != 0x080)
				throw source_exception("Invalid utf-8");
			result = ((result & 0x00F) << 12) | ((ch[0] & 0x03F) << 6) | (ch[1] & 0x03F);
		}
		else if ((result & 0x0F8) == 0x0F0)
		{
			// four byte sequence
			ch[0] = next_byte();
			ch[1] = next_byte();
			ch[2] = next_byte();
			if ((ch[0] & 0x0c0) != 0x080 or (ch[1] & 0x0c0) != 0x080 or (ch[2] & 0x0c0) != 0x080)
				throw source_exception("Invalid utf-8");
			result = ((result & 0x007) << 18) | ((ch[0] & 0x03F) << 12) | ((ch[1] & 0x03F) << 6) | (ch[2] & 0x03F);

			if (result > 0x10ffff)
				throw source_exception("invalid utf-8 character (out of range)");
		}
		else
		{
			// A lone continuation byte (0x80-0xbf) or a byte in the range
			// 0xf8-0xff can never be the start of a sequence. Returning it
			// as if it were a character would let invalid input pass.
			throw source_exception("Invalid utf-8");
		}
	}

	return result;
}

char32_t istream_data_source::next_utf16le_char()
{
	char8_t c1 = next_byte(), c2 = next_byte();

	char32_t ch = (static_cast<char32_t>(c2) << 8) | c1;

	if (ch >= 0x080)
	{
		// surrogate support
		if (ch >= 0x0D800 and ch <= 0x0DBFF)
		{
			char32_t uc2 = next_utf16le_char();
			if (uc2 >= 0x0DC00 and uc2 <= 0x0DFFF)
				ch = (ch - 0x0D800) * 0x400 + (uc2 - 0x0DC00) + 0x010000;
			else
				throw not_wf_exception("Document (line: " + std::to_string(m_line_nr) + " not well-formed: leading surrogate character without trailing surrogate character");
		}
		else if (ch >= 0x0DC00 and ch <= 0x0DFFF)
			throw not_wf_exception("Document (line: " + std::to_string(m_line_nr) + " not well-formed: trailing surrogate character without a leading surrogate");
	}

	return ch;
}

char32_t istream_data_source::next_utf16be_char()
{
	char8_t c1 = next_byte(), c2 = next_byte();

	char32_t ch = (static_cast<char32_t>(c1) << 8) | c2;

	if (ch >= 0x080)
	{
		// surrogate support
		if (ch >= 0x0D800 and ch <= 0x0DBFF)
		{
			char32_t uc2 = next_utf16be_char();
			if (uc2 >= 0x0DC00 and uc2 <= 0x0DFFF)
				ch = (ch - 0x0D800) * 0x400 + (uc2 - 0x0DC00) + 0x010000;
			else
				throw not_wf_exception("Document (line: " + std::to_string(m_line_nr) + " not well-formed: leading surrogate character without trailing surrogate character");
		}
		else if (ch >= 0x0DC00 and ch <= 0x0DFFF)
			throw not_wf_exception("Document (line: " + std::to_string(m_line_nr) + " not well-formed: trailing surrogate character without a leading surrogate");
	}

	return ch;
}

/// \brief Decode the next ISO-8859-1 character: the value of the byte is the character
char32_t istream_data_source::next_iso88591_char()
{
	const char32_t ch = next_byte();
	return ch;
}

/// \brief Decode the next ASCII character
///
/// \throws not_wf_exception when the byte read has a value above 127
char32_t istream_data_source::next_ascii_char()
{
	const char32_t ch = next_byte();

	if (ch > 0x7f)
		throw not_wf_exception("Invalid ascii value");

	return ch;
}

/// \brief Return the next character with line ends normalised to '\n'
///
/// The character is taken from m_char_buffer when that is non zero and
/// from the active decoder otherwise. The line counter is incremented for
/// each '\n' that is returned.
/// \throws not_wf_exception for the characters 0xfffe and 0xffff
char32_t istream_data_source::get_next_char()
{
	char32_t ch = m_char_buffer;

	if (ch == 0)
		ch = (this->*m_next)();
	else
		m_char_buffer = 0;

	// the message has the same layout as the one built by parser_imp::not_well_formed
	if (ch == 0x0ffff or ch == 0x0fffe)
		throw not_wf_exception("Document (line: " + std::to_string(m_line_nr) + ") not well-formed: character " + to_hex(ch) + " is not allowed");

	if (ch == '\r')
	{
		// A carriage return is always returned as '\n'. The character that
		// follows is dropped when it is '\n' or, for versions other than 1.0
		// and encodings other than ASCII, when it is 0x85. In all other cases
		// it is kept in m_char_buffer for the next call.
		ch = (this->*m_next)();
		if (ch != '\n' and (m_version == version_type{ 1, 0 } or ch != 0x85 or m_encoding == encoding_type::ASCII))
			m_char_buffer = ch;
		ch = '\n';
	}

	if (m_encoding != encoding_type::ASCII)
	{
		// for versions above 1.0 the characters 0x85 and 0x2028 are line
		// ends as well, the latter is not looked for in ISO88591 input
		if ((m_version > version_type{ 1, 0 } and ch == 0x85) or
			(m_encoding != encoding_type::ISO88591 and m_version > version_type{ 1, 0 } and ch == 0x2028))
			ch = '\n';
	}

	if (ch == '\n')
		++m_line_nr;

	return ch;
}

// --------------------------------------------------------------------

/// \brief A data_source that delivers the characters of a string it keeps a copy of
class string_data_source : public data_source
{
  public:
	explicit string_data_source(std::string data)
		: m_data(std::move(data))
		, m_ptr(m_data.cbegin())
	{
	}

	/// \brief Return the next character, or zero when the string is exhausted
	///
	/// The line counter is incremented for each '\n' that is returned.
	char32_t get_next_char() override
	{
		if (m_ptr == m_data.cend())
			return 0;

		const char32_t ch = pop_front_char(m_ptr, m_data.cend());

		if (ch == '\n')
			++m_line_nr;

		return ch;
	}

  private:
	std::string m_data;                // the text to deliver
	std::string::const_iterator m_ptr; // position of the next character in m_data
};

// --------------------------------------------------------------------

class entity_data_source : public string_data_source
{
  public:
	entity_data_source(std::string text, std::string entity_path)
		: string_data_source(std::move(text))
	{
		base(std::move(entity_path));
	}
};

// --------------------------------------------------------------------

class parameter_entity_data_source : public string_data_source
{
  public:
	parameter_entity_data_source(const std::string &data, std::string base_dir)
		: string_data_source(" " + data + " ")
	{
		base(std::move(base_dir));
	}
};

// --------------------------------------------------------------------

/// \brief Remembers the id of a data_source so that it can be verified
/// later on that another data_source is the very same one
class valid_nesting_validator
{
  public:
	explicit valid_nesting_validator(data_source &source)
		: m_id(source.id())
	{
	}

	/// \brief Compare the id of \a source with the id stored at construction
	/// \throws invalid_exception when the two ids are not the same
	void check(data_source &source)
	{
		const bool same_source = source.id() == m_id;

		if (not same_source)
			throw invalid_exception("proper nesting validation error");
	}

  private:
	int m_id; // id of the data_source passed to the constructor
};

// --------------------------------------------------------------------

// The implementation of the parser: holds the stack of data sources, the
// scanner state (look-ahead token, token text and a small push back buffer),
// the information collected from the doctype and the namespace state.
struct parser_imp
{
	// Pushes an istream_data_source for data and registers the five
	// predefined general entities (see the constructor definition).
	parser_imp(std::istream &data, parser &parser);

	~parser_imp();

	// Here comes the parser part
	void parse(bool validate, bool validate_ns);

	// the productions. Some are inlined below for obvious reasons.
	// names of the productions try to follow those in the TR http://www.w3.org/TR/xml
	void prolog();
	void xml_decl();
	void text_decl();

	void s(bool at_least_one = false);
	void eq();
	void misc();
	void element(doctype::validator &valid);
	void content(doctype::validator &valid);

	void comment();
	void pi();

	void pereference();

	void doctypedecl();
	data_source *get_data_source(std::string_view pubid, std::string uri);
	std::tuple<std::string, std::string> read_external_id();
	void intsubset();
	void extsubset();
	void declsep();
	void conditionalsect();
	void ignoresectcontents();
	void markup_decl();
	void element_decl();
	void contentspec(doctype::element &element);
	doctype::content_spec_base_ptr cp();
	void attlist_decl();
	void notation_decl();
	void entity_decl();
	void parameter_entity_decl();
	void general_entity_decl();

	// at several locations we need to parse out entity references from strings:
	void parse_parameter_entity_declaration(std::string &s);
	void parse_general_entity_declaration(std::string &s);

	// same goes for attribute values
	//
	// Pushes a string_data_source for s and runs the parameterless
	// normalize_attribute_value on it. When the document is standalone and
	// the value was changed this is reported through not_valid. Unless
	// isCDATA is true the result is passed to collapse_spaces.
	std::string normalize_attribute_value(const std::string &s, bool isCDATA)
	{
		push_data_source(new string_data_source(s), false);

		std::string result = normalize_attribute_value();

		if (m_standalone and result != s)
			not_valid("Document cannot be standalone since an attribute was modified");

		if (not isCDATA)
			collapse_spaces(result);

		return result;
	}

	std::string normalize_attribute_value();

	void collapse_spaces(std::string &s);

	// The scanner is next. We recognize the following tokens:
	// (tokens consisting of a single character have the value of that character)
	enum class XMLToken
	{
		Undef = 0,

		Eq = '=',
		QuestionMark = '?',
		GreaterThan = '>',
		OpenBracket = '[',
		CloseBracket = ']',
		OpenParenthesis = '(',
		CloseParenthesis = ')',
		Percent = '%',
		Plus = '+',
		Pipe = '|',
		Asterisk = '*',
		Slash = '/',
		Comma = ',',

		Eof = 256,
		Other, //

		// these are tokens for the markup

		XMLDecl,  // <?xml
		Space,    // Really needed
		Comment,  // <!--
		Name,     // name-start-char (name-char)*
		NMToken,  // (name-char)+
		String,   // (\"[^"]*\") | (\'[^\']*\')		// single or double quoted std::string
		PI,       // <?
		STag,     // <
		ETag,     // </
		DocType,  // <!DOCTYPE
		Element,  // <!ELEMENT
		AttList,  // <!ATTLIST
		Entity,   // <!ENTITY
		Notation, // <!NOTATION

		Required, // #REQUIRED
		Implied,  // #IMPLIED
		PCData,   // #PCDATA
		Fixed,    // #FIXED

		IncludeIgnore, // <![

		PEReference, // %name;

		// next are tokens for the content part

		CharRef,   // &#xx; and the like
		Reference, // &name;
		CDSect,    // CData section <![CDATA[ ... ]]>
		Content,   // anything else up to the next element start
	};

	// for debugging and error reporting we have the following describing routine
	// It returns a string literal for each token, an unknown value triggers
	// the assert and yields "unknown token".
	constexpr const char *describe_token(XMLToken token)
	{
		switch (token)
		{
			case XMLToken::Undef: return "undefined";
			case XMLToken::Eq: return "=";
			case XMLToken::QuestionMark: return "?";
			case XMLToken::GreaterThan: return ">";
			case XMLToken::OpenBracket: return "[";
			case XMLToken::CloseBracket: return "]";
			case XMLToken::OpenParenthesis: return "(";
			case XMLToken::CloseParenthesis: return ")";
			case XMLToken::Percent: return "%";
			case XMLToken::Plus: return "+";
			case XMLToken::Pipe: return "|";
			case XMLToken::Asterisk: return "*";
			case XMLToken::Slash: return "/";
			case XMLToken::Comma: return ",";
			case XMLToken::Eof: return "end of file";
			case XMLToken::Other: return "an invalid character";
			case XMLToken::XMLDecl: return "'<?xml'";
			case XMLToken::Space: return "space character";
			case XMLToken::Comment: return "comment";
			case XMLToken::Name: return "identifier or name";
			case XMLToken::NMToken: return "nmtoken";
			case XMLToken::String: return "quoted string";
			case XMLToken::PI: return "processing instruction";
			case XMLToken::STag: return "tag";
			case XMLToken::ETag: return "end tag";
			case XMLToken::DocType: return "<!DOCTYPE";
			case XMLToken::Element: return "<!ELEMENT";
			case XMLToken::AttList: return "<!ATTLIST";
			case XMLToken::Entity: return "<!ENTITY";
			case XMLToken::Notation: return "<!NOTATION";
			case XMLToken::Required: return "#REQUIRED";
			case XMLToken::Implied: return "#IMPLIED";
			case XMLToken::Fixed: return "#FIXED";
			case XMLToken::PCData: return "#PCData";
			case XMLToken::PEReference: return "parameter entity reference";
			case XMLToken::CharRef: return "character reference";
			case XMLToken::Reference: return "entity reference";
			case XMLToken::CDSect: return "CDATA section";
			case XMLToken::Content: return "content";
			case XMLToken::IncludeIgnore: return "<![ (as in <![INCLUDE[ )";
			default: assert(false); return "unknown token";
		}
	}

	char32_t get_next_char();

	// Recognizing tokens differs if we are expecting markup or content in elements:
	XMLToken get_next_token();
	XMLToken get_next_content();

	// retract is used when we've read a character too much from the input stream
	void retract();

	// match, check if the look-a-head token is really what we expect here.
	// throws if it isn't. Depending on the content flag we call either get_next_token or get_next_content
	// to find the next look-a-head token.
	void match(XMLToken token);

	// utility routine
	version_type parse_version();

	// error handling routines
	[[noreturn]] void not_well_formed(const std::string &msg) const;
	void not_valid(std::string msg) const;

	// doctype support
	[[nodiscard]] const doctype::entity &get_general_entity(std::string_view name) const;
	[[nodiscard]] const doctype::entity &get_parameter_entity(std::string_view name) const;
	[[nodiscard]] const doctype::element_ptr get_element(std::string_view name) const;

	// Scope guard for a bool: the constructor stores the current value of
	// state and assigns v to it, the destructor (and reset) put the stored
	// value back.
	struct save_state
	{
		save_state(bool &state, bool v)
			: m_state(state)
			, m_saved(state)
		{
			state = v;
		}

		~save_state()
		{
			m_state = m_saved;
		}

		void reset()
		{
			m_state = m_saved;
		}

	  private:
		bool &m_state;
		bool m_saved;
	};

	// An entry in the stack of data sources. The constructor takes the
	// current token text and push back buffer out of the parser (leaving it
	// with an empty token and a zeroed, empty buffer) and records the buffer
	// position and the look-ahead token. The destructor puts all of these
	// back and deletes the data_source.
	struct source_state
	{
		source_state(parser_imp *impl, data_source *source, bool insert)
			: m_impl(*impl)
			, m_source(source)
			, m_buffer_offset(m_impl.m_buffer_ptr - m_impl.m_buffer.begin())
			, m_lookahead(m_impl.m_lookahead)
			, m_inserted(insert)
		{
			std::swap(m_token, m_impl.m_token);
			std::swap(m_buffer, m_impl.m_buffer);
			m_impl.m_buffer_ptr = m_impl.m_buffer.begin();
		}

		~source_state()
		{
			std::swap(m_token, m_impl.m_token);
			std::swap(m_buffer, m_impl.m_buffer);
			m_impl.m_buffer_ptr = m_impl.m_buffer.begin() + m_buffer_offset;
			m_impl.m_lookahead = m_lookahead;
			delete m_source;
		}

		// access to the data_source of this entry
		data_source *operator->() const { return m_source; }
		data_source &operator*() const { return *m_source; }

		// the value of the insert flag passed to the constructor
		[[nodiscard]] bool inserted() const { return m_inserted; }

		parser_imp &m_impl;
		data_source *m_source;
		std::array<char32_t, 4> m_buffer{};
		std::ptrdiff_t m_buffer_offset;
		XMLToken m_lookahead;
		std::string m_token;
		bool m_inserted;
	};

	// Make source the current data source, it is deleted when it is popped.
	// The source is given the XML version of the parser first.
	void push_data_source(data_source *source, bool insert)
	{
		source->version(m_version);
		m_source.emplace(this, source, insert);
	}

	// Remove the current data source, there must be one.
	void pop_data_source()
	{
		assert(not m_source.empty());
		m_source.pop();
	}

	// And during parsing we keep track of the namespaces we encounter.
	// An ns_state installs itself as the current namespace state of the
	// parser (m_ns) and restores the previous one, kept in m_next, when it
	// is destroyed. Lookups that find nothing here continue in m_next.
	class ns_state
	{
	  public:
		explicit ns_state(parser_imp *imp)
			: m_parser_imp(imp)
			, m_next(imp->m_ns)
		{
			m_parser_imp->m_ns = this;
		}

		~ns_state()
		{
			m_parser_imp->m_ns = m_next;
		}

		// the default namespace set here or else the one of the previous state
		std::string default_ns()
		{
			std::string result = m_default_ns;
			if (result.empty() and m_next != nullptr)
				result = m_next->default_ns();
			return result;
		}

		void default_ns(std::string ns)
		{
			m_default_ns = std::move(ns);
		}

		// the uri bound to prefix, empty if the prefix was unbound here or
		// is not known at all
		std::string ns_for_prefix(const std::string &prefix)
		{
			std::string result;

			if (m_unbound.count(prefix) == 0)
			{
				auto np = m_known.find(prefix);
				if (np != m_known.end())
					result = np->second;
				else if (m_next != nullptr)
					result = m_next->ns_for_prefix(prefix);
			}

			return result;
		}

		void bind(const std::string &prefix, std::string uri)
		{
			m_known[prefix] = std::move(uri);
		}

		void unbind(std::string prefix)
		{
			m_unbound.insert(std::move(prefix));
		}

		// true if prefix is bound here or in a previous state and was not
		// unbound in a state that was searched before the binding was found
		bool is_known_prefix(const std::string &prefix)
		{
			bool result = false;

			if (not m_unbound.count(prefix))
			{
				if (m_known.count(prefix))
					result = true;
				else if (m_next != nullptr)
					result = m_next->is_known_prefix(prefix);
			}

			return result;
		}

		// true if some prefix is bound to uri, here or in a previous state
		bool is_known_uri(const std::string &uri)
		{
			for (const auto &k : m_known)
			{
				if (k.second == uri)
					return true;
			}

			return m_next != nullptr and m_next->is_known_uri(uri);
		}

	  private:
		parser_imp *m_parser_imp;
		std::string m_default_ns;
		ns_state *m_next;

		std::map<std::string, std::string> m_known; // prefix -> uri
		std::set<std::string> m_unbound;            // prefixes passed to unbind
	};

	// is uc a valid character for the XML version of the document
	bool is_char(char32_t uc)
	{
		return m_version == version_type{ 1, 0 } ? is_valid_xml_1_0_char(uc) : is_valid_xml_1_1_char(uc);
	}

	// is uc a space, tab, newline or carriage return
	bool is_space(char32_t uc)
	{
		return uc == ' ' or uc == '\t' or uc == '\n' or uc == '\r';
	}

	// is s a non-empty string consisting of space, tab, newline and
	// carriage return characters only
	bool is_space(std::string_view s)
	{
		return not s.empty() and s.find_first_not_of(" \t\r\n") == std::string_view::npos;
	}

	// may charref be used in a character reference, the accepted set
	// depends on the XML version of the document
	bool is_referrable_char(char32_t charref)
	{
		return m_version == version_type{ 1, 0 }
		           ? charref == 0x09 or
		                 charref == 0x0A or
		                 charref == 0x0D or
		                 (charref > 0x01F and charref < 0x0D800) or
		                 (charref > 0x0DFFF and charref < 0x0FFFE) or
		                 (charref > 0x0FFFF and charref < 0x00110000)
		           :

		           // 1.1
		           (charref > 0x0 and charref < 0x0D800) or
		               (charref > 0x0DFFF and charref < 0x0FFFE) or
		               (charref > 0x0FFFF and charref < 0x00110000);
	}

	parser &m_parser;
	bool m_validating{};
	bool m_validating_ns{};
	bool m_has_dtd{};
	bool m_is_html5 = false; // needed to see if we can use built in named characters
	XMLToken m_lookahead{ XMLToken::Eof };
	std::string m_token; // text of the token being scanned

	std::stack<source_state> m_source; // the stacked data sources, top is current

	// characters that were retracted, the last one stored is read first
	std::array<char32_t, 4> m_buffer{};
	std::array<char32_t, 4>::iterator m_buffer_ptr = m_buffer.begin();

	version_type m_version{ 1, 0 };
	encoding_type m_encoding = encoding_type::UTF8;
	bool m_standalone{};

	// parser state
	bool m_external_subset = false;
	bool m_internal_subset = false;
	bool m_allow_peref = false;
	bool m_in_declsep = false;
	bool m_in_external_dtd = false;
	bool m_in_content = false;

	std::vector<std::string> m_entities_on_stack;
	ns_state *m_ns{};

	std::string m_root_element;
	doctype::entity_list m_parameter_entities;
	doctype::entity_list m_general_entities;
	doctype::element_list m_doctype;

	std::set<std::string> m_notations;
	std::set<std::string> m_ids;            // attributes of type ID should be unique
	std::set<std::string> m_unresolved_ids; // keep track of IDREFS that were not found yet

	doctype::attribute_ptr m_xmlSpaceAttr;
};

// --------------------------------------------------------------------
// some inlines

/// \brief Skip a run of Space tokens
///
/// When \a at_least_one is true the first token must be a Space token,
/// otherwise match reports the error.
inline void parser_imp::s(bool at_least_one)
{
	// the first match is unconditional if a space is mandatory, all
	// following ones depend on the look-ahead token
	for (bool mandatory = at_least_one; mandatory or m_lookahead == XMLToken::Space; mandatory = false)
		match(XMLToken::Space);
}

/// \brief Match an equals sign with optional Space tokens on either side
inline void parser_imp::eq()
{
	s(false); // spaces are optional here
	match(XMLToken::Eq);
	s(false); // and here
}

// --------------------------------------------------------------------

/// \brief Set up a parser implementation reading from \a data on behalf of \a parser
///
/// The stream is wrapped in an istream_data_source and the encoding found
/// by that source is copied into m_encoding.
parser_imp::parser_imp(std::istream &data, parser &parser)
	: m_parser(parser)
	, m_encoding(encoding_type::ASCII)
{
	push_data_source(new istream_data_source(data), false);

	m_encoding = m_source.top()->encoding();

	// these entities are always recognized:
	static constexpr std::pair<const char *, const char *> kPredefinedEntities[] = {
		{ "lt", "&#60;" },
		{ "gt", "&#62;" },
		{ "amp", "&#38;" },
		{ "apos", "&#39;" },
		{ "quot", "&#34;" },
	};

	for (const auto &[name, replacement] : kPredefinedEntities)
		m_general_entities.push_back(std::make_shared<doctype::general_entity>(name, replacement));

	// the declaration of the xml:space attribute with its two allowed values
	m_xmlSpaceAttr = std::make_shared<doctype::attribute>("xml:space", doctype::attribute_type::Enumerated, std::vector<std::string>{ "preserve", "default" });
}

/// \brief Pop the data sources that are left on the stack, the top one first
parser_imp::~parser_imp()
{
	// Each source_state writes back into this object when it is destroyed,
	// so they are removed here, while all members still exist.
	for (; not m_source.empty(); m_source.pop())
	{
	}
}

/// \brief Look up the general entity called \a name
///
/// The declared entities are searched first. If the document is flagged
/// as html5 the named characters known to doctype are tried next.
/// Finding an external entity in a standalone document is reported
/// through not_valid.
/// \return the entity found
/// \throws not_wf_exception (via not_well_formed) when there is no such entity
const doctype::entity &parser_imp::get_general_entity(std::string_view name) const
{
	const auto declared = std::find_if(m_general_entities.begin(), m_general_entities.end(),
		[name](const auto &candidate)
		{ return candidate->name() == name; });

	if (declared != m_general_entities.end())
	{
		const auto &e = *declared;

		if (e->is_external() and m_standalone)
			not_valid("Document cannot be standalone since entity " + std::string{ name } + " is defined externally");

		return *e;
	}

	if (m_is_html5)
	{
		if (auto c = doctype::get_named_character(name); c != nullptr)
			return *c;
	}

	not_well_formed("undefined entity reference '" + std::string{ name } + "'");
}

/// \brief Look up the parameter entity called \a name
///
/// \return the entity found
/// \throws not_wf_exception (via not_well_formed) when there is no such
///         entity, the message contains the text of the current token
const doctype::entity &parser_imp::get_parameter_entity(std::string_view name) const
{
	const auto declared = std::find_if(m_parameter_entities.begin(), m_parameter_entities.end(),
		[name](const auto &candidate)
		{ return candidate->name() == name; });

	if (declared == m_parameter_entities.end())
		not_well_formed("Undefined parameter entity '" + m_token + '\'');

	return **declared;
}

/// \brief Look up the element declaration called \a name
///
/// \return the first declaration in m_doctype with this name, or a default
///         constructed element_ptr when there is none
const doctype::element_ptr parser_imp::get_element(std::string_view name) const
{
	const auto declared = std::find_if(m_doctype.begin(), m_doctype.end(),
		[name](const auto &candidate)
		{ return candidate->name() == name; });

	if (declared == m_doctype.end())
		return {};

	return *declared;
}

/// \brief Deliver the next character and append it to the token text
///
/// Characters that were retracted are delivered first. Otherwise the
/// character comes from the data source on top of the stack. A source that
/// was pushed with the insert flag set is popped as soon as it returns
/// zero and reading continues with the source below it.
/// \return the character, which is zero if no character could be read
/// \throws not_wf_exception (via not_well_formed) when the data source
///         throws a source_exception
char32_t parser_imp::get_next_char()
{
	char32_t ch = 0;

	// if buffer is not empty we already did all the validity checks
	if (m_buffer_ptr > m_buffer.begin())
		ch = *--m_buffer_ptr;

	while (ch == 0 and not m_source.empty())
	{
		try
		{
			ch = m_source.top()->get_next_char();
		}
		catch (source_exception &e)
		{
			not_well_formed(e.m_wmsg);
		}

		// done if there is a character or if the exhausted source is not an inserted one
		if (ch != 0 or not m_source.top().inserted())
			break;

		m_source.pop();
	}

	append(m_token, ch);

	return ch;
}

/// \brief Take the last character off the token text and store it in the
/// push back buffer, from where get_next_char will deliver it again
void parser_imp::retract()
{
	assert(not m_token.empty());
	assert(m_buffer_ptr < m_buffer.end());

	const char32_t last = pop_back_char(m_token);

	*m_buffer_ptr = last;
	++m_buffer_ptr;
}

/// \brief Check that the look-ahead token is \a token and fetch the next one
///
/// The next token is read with get_next_content when m_in_content is set
/// and with get_next_token otherwise. In the latter case a parameter
/// entity reference found outside a declsep is expanded by pereference
/// when m_allow_peref is set.
/// \throws not_wf_exception (via not_well_formed) when the look-ahead token
///         is not \a token or when a parameter entity reference is found
///         while these are not allowed
void parser_imp::match(XMLToken token)
{
	if (m_lookahead != token)
	{
		std::string msg = "Error parsing XML, expected '";
		msg += describe_token(token);
		msg += "' but found '";
		msg += describe_token(m_lookahead);
		msg += "' ('";
		msg += m_token;
		msg += "')";

		not_well_formed(msg);
	}

	if (m_in_content)
	{
		m_lookahead = get_next_content();
		return;
	}

	m_lookahead = get_next_token();

	if (m_lookahead != XMLToken::PEReference or m_in_declsep)
		return;

	if (not m_allow_peref)
		not_well_formed("Invalid entity reference at this location");

	pereference();
}

/// \brief Report that the document is not well-formed, this function never returns
///
/// The line number of the current data source is part of the message if
/// there is a data source.
/// \throws not_wf_exception always
void parser_imp::not_well_formed(const std::string &msg) const
{
	if (m_source.empty())
		throw not_wf_exception(std::format("Document not well-formed: {}", msg));

	throw not_wf_exception(std::format("Document (line: {}) not well-formed: {}", m_source.top()->line_nr(), msg));
}

// Report a validity error. When validating this throws an invalid_exception
// (with the current line number if a source is available), otherwise the
// message is only handed to the parser's report_invalidation().
void parser_imp::not_valid(std::string msg) const
{
	if (not m_validating)
	{
		m_parser.report_invalidation(std::move(msg));
		return;
	}

	if (m_source.empty())
		throw invalid_exception(std::format("Document not valid: {}", msg));

	throw invalid_exception(
		std::format("Document (line: {}) not valid: {}", m_source.top()->line_nr(), msg));
}

/*
    get_next_token is a hand optimised scanner for tokens in the input stream.
*/

// Scan the next token outside of element content (prolog, DTD, inside tags).
//
// m_token is cleared first and then collects every character that is read
// through get_next_char(). The loop runs a small state machine until a token
// has been recognised. On return m_token holds the text of the token; for
// String and PEReference tokens the delimiters (quotes, respectively '%' and
// ';') have been stripped. Characters read one too far are pushed back with
// retract(). Malformed input is reported through not_well_formed().
parser_imp::XMLToken parser_imp::get_next_token()
{
	// The values leave room for sub states, which are reached with 'state += 1'
	// (see state_PERef + 1 below).
	enum State
	{
		state_Start = 0,
		state_WhiteSpace = 10,
		state_Tag = 20,
		state_String = 30,
		state_PERef = 40,
		state_HashName = 49,
		state_Name = 50,
		state_CommentOrDoctype = 60,
		state_Comment = 70,
		state_DocTypeDecl = 80,
		state_PI = 90,
	};

	XMLToken token = XMLToken::Undef;
	char32_t quote_char = 0;    // the quote that opened the string being scanned
	int state = state_Start;
	bool might_be_name = false; // true when the first character was a name start character

	m_token.clear();

	while (token == XMLToken::Undef)
	{
		char32_t uc = get_next_char();

		switch (state)
		{
			// start scanning.
			case state_Start:
				switch (uc)
				{
					case 0:
						token = XMLToken::Eof;
						break;

					case ' ':
					case '\t':
					case '\n':
						state = state_WhiteSpace;
						break;

					case '<':
						state = state_Tag;
						break;

					case '\'':
					case '"':
					{
						state = state_String;
						quote_char = uc;
						break;
					}

					case '%':
						state = state_PERef;
						break;

					case '#':
						state = state_HashName;
						break;

					// single character tokens
					case '=': token = XMLToken::Eq; break;
					case '?': token = XMLToken::QuestionMark; break;
					case '>': token = XMLToken::GreaterThan; break;
					case '[': token = XMLToken::OpenBracket; break;
					case ']': token = XMLToken::CloseBracket; break;
					case '(': token = XMLToken::OpenParenthesis; break;
					case ')': token = XMLToken::CloseParenthesis; break;
					case '+': token = XMLToken::Plus; break;
					case '|': token = XMLToken::Pipe; break;
					case '*': token = XMLToken::Asterisk; break;
					case '/': token = XMLToken::Slash; break;
					case ',': token = XMLToken::Comma; break;

					default:
						// A name start character may begin a Name, any other name
						// character can only begin an NMToken.
						if (is_name_start_char(uc))
						{
							might_be_name = true;
							state = state_Name;
						}
						else if (is_name_char(uc))
							state = state_Name;
						else if (is_char(uc))
							token = XMLToken::Other;
						else
							not_well_formed("Unexpected character: " + ((uc < 128 and std::isprint(static_cast<int>(uc))) ? std::string(1, static_cast<char>(uc)) : to_hex(uc)));

						break;
				}
				break;

			// collect all whitespace
			case state_WhiteSpace:
				if (uc != ' ' and uc != '\t' and uc != '\n')
				{
					retract();
					token = XMLToken::Space;
				}
				break;

			// We scanned a < character, decide what to do next.
			case state_Tag:
				if (uc == '!') // comment or doctype thing
					state = state_CommentOrDoctype;
				else if (uc == '/') // end tag
					token = XMLToken::ETag;
				else if (uc == '?') // processing instruction
					state = state_PI;
				else // anything else
				{
					retract();
					token = XMLToken::STag;
				}
				break;

			// So we had <! which can only be followed validly by '-', '[' or a character at the current location
			case state_CommentOrDoctype:
				if (uc == '-')
					state = state_Comment;
				else if (uc == '[' /*and m_external_subset*/)
					token = XMLToken::IncludeIgnore;
				else if (is_name_start_char(uc))
					state = state_DocTypeDecl;
				else
					not_well_formed("Unexpected character");
				break;

			// Comment, strictly check for <!-- -->
			// Only the opening <!-- is consumed here, the token is returned right after it.
			case state_Comment:
				if (uc == '-')
					token = XMLToken::Comment;
				else
					not_well_formed("Invalid formatted comment");
				break;

			// scan for processing instructions
			case state_PI:
				if (not is_name_char(uc))
				{
					retract();

					// m_token is "<?" followed by the target, substr(2) is the target itself.
					// we treat the xml processing instruction separately.
					if (m_token.substr(2) == "xml")
						token = XMLToken::XMLDecl;
					else if (iequals(m_token.substr(2), "xml"))
						not_well_formed("<?XML is neither an XML declaration nor a legal processing instruction target");
					else
						token = XMLToken::PI;
				}
				break;

			// One of the DOCTYPE tags. We scanned <!(char), continue until non-char
			case state_DocTypeDecl:
				if (not is_name_char(uc))
				{
					retract();

					if (m_token == "<!DOCTYPE")
						token = XMLToken::DocType;
					else if (m_token == "<!ELEMENT")
						token = XMLToken::Element;
					else if (m_token == "<!ATTLIST")
						token = XMLToken::AttList;
					else if (m_token == "<!ENTITY")
						token = XMLToken::Entity;
					else if (m_token == "<!NOTATION")
						token = XMLToken::Notation;
					else
						not_well_formed("invalid doctype declaration '" + m_token + "'");
				}
				break;

			// strings
			case state_String:
				if (uc == quote_char)
				{
					token = XMLToken::String;
					// drop the opening and closing quote
					m_token = m_token.substr(1, m_token.length() - 2);
				}
				else if (uc == 0)
					not_well_formed("unexpected end of file, runaway std::string");
				break;

			// Names
			// A '#' was seen, only the four keywords below are accepted.
			case state_HashName:
				if (not is_name_char(uc))
				{
					retract();

					if (m_token == "#PCDATA")
						token = XMLToken::PCData;
					else if (m_token == "#FIXED")
						token = XMLToken::Fixed;
					else if (m_token == "#IMPLIED")
						token = XMLToken::Implied;
					else if (m_token == "#REQUIRED")
						token = XMLToken::Required;
					else
						not_well_formed("Unexpected token " + m_token);
				}
				break;

			case state_Name:
				if (not is_name_char(uc))
				{
					retract();

					if (might_be_name)
						token = XMLToken::Name;
					else
						token = XMLToken::NMToken;
				}
				break;

			// parameter entity references
			// A '%' that is not followed by a name start character is a plain Percent token.
			case state_PERef:
				if (is_name_start_char(uc))
					state += 1;
				else
				{
					retract();
					token = XMLToken::Percent;
				}
				break;

			case state_PERef + 1:
				if (uc == ';')
				{
					// strip the leading '%' and the trailing ';'
					m_token = m_token.substr(1, m_token.length() - 2);
					token = XMLToken::PEReference;
				}
				else if (not is_name_char(uc))
					not_well_formed("invalid parameter entity reference");
				break;

			default:
				assert(false);
				not_well_formed("state should never be reached");
		}
	}

	return token;
}

// Scan the next token inside element content.
//
// m_token is cleared first and then collects every character that is read
// through get_next_char(). Every character other than the end marker 0 must
// satisfy is_char(), anything else is a well-formedness error. On return
// m_token holds the text of the token: for Reference the entity name, for
// CharRef the referenced character, for CDSect the data between "<![CDATA["
// and "]]>". Characters read one too far are pushed back with retract().
parser_imp::XMLToken parser_imp::get_next_content()
{
	// The values leave room for sub states, reached with 'state += 1' or
	// 'state_X + n'.
	enum State
	{
		state_Start = 10,
		state_Tag = 20,
		state_Reference = 30,
		state_WhiteSpace = 40,
		state_Content = 50,
		state_PI = 60,
		state_CommentOrCDATA = 70,
		state_Comment = 80,
		state_CDATA = 90,
		state_Illegal = 100
	};

	XMLToken token = XMLToken::Undef;
	int state = state_Start;
	char32_t charref = 0; // value of the character reference being accumulated

	m_token.clear();

	while (token == XMLToken::Undef)
	{
		char32_t uc = get_next_char();

		if (uc != 0 and not is_char(uc))
			not_well_formed("illegal character in content: '" + to_hex(uc) + "'");

		switch (state)
		{
			case state_Start:
				switch (uc)
				{
					case 0:
						token = XMLToken::Eof; // end of file reached
						break;

					case '<':
						state = state_Tag; // beginning of a tag
						break;

					case '&':
						state = state_Reference; // start of a reference (&...;)
						break;

					case ']':
						state = state_Illegal; // avoid ]]> in text
						break;

					case ' ':
					case '\t':
					case '\n':
					case '\r':
						state = state_WhiteSpace;
						break;

					default:
						if (is_char(uc))
							state = state_Content; // anything else
						else
							// std::isprint is only defined for values that fit in an
							// unsigned char, so guard it the same way get_next_token does.
							not_well_formed("Unexpected character in content: " + ((uc < 128 and std::isprint(static_cast<int>(uc))) ? std::string(1, static_cast<char>(uc)) : to_hex(uc)));
						break;
				}
				break;

			// collect all whitespace
			case state_WhiteSpace:
				if (not is_space(uc))
				{
					retract();
					token = XMLToken::Space;
				}
				break;

			// content. Only stop collecting character when uc is special
			case state_Content:
				if (uc == ']')
					state = state_Illegal;
				else if (uc == 0 or uc == '<' or uc == '&')
				{
					retract();
					token = XMLToken::Content;
				}
				else if (not is_referrable_char(uc))
					not_well_formed("Illegal character in content text");
				break;

			// beginning of a tag?
			case state_Tag:
				if (uc == '/')
					token = XMLToken::ETag;
				else if (uc == '?') // processing instruction
					state = state_PI;
				else if (uc == '!') // comment or CDATA
					state = state_CommentOrCDATA;
				else
				{
					retract();
					token = XMLToken::STag;
				}
				break;

			// processing instructions
			case state_PI:
				if (not is_name_char(uc))
				{
					retract();
					token = XMLToken::PI;
				}
				break;

			// comment or CDATA
			case state_CommentOrCDATA:
				if (uc == '-') // comment
					state = state_Comment;
				else if (uc == '[')
					state = state_CDATA; // CDATA
				else
					not_well_formed("invalid content");
				break;

			// only the opening <!-- is consumed here
			case state_Comment:
				if (uc == '-')
					token = XMLToken::Comment;
				else
					not_well_formed("invalid content");
				break;

			// CDATA (we parsed <![ up to this location)
			case state_CDATA:
				if (is_name_start_char(uc))
					state += 1;
				else
					not_well_formed("invalid content");
				break;

			// collecting the keyword, the second '[' is only accepted after CDATA
			case state_CDATA + 1:
				if (uc == '[' and m_token == "<![CDATA[")
					state += 1;
				else if (not is_name_char(uc))
					not_well_formed("invalid content");
				break;

			// inside the section, looking for the first ']'
			case state_CDATA + 2:
				if (uc == ']')
					state += 1;
				else if (uc == 0)
					not_well_formed("runaway cdata section");
				break;

			// one ']' seen
			case state_CDATA + 3:
				if (uc == ']')
					state += 1;
				else if (uc == 0)
					not_well_formed("runaway cdata section");
				else if (uc != ']')
					state = state_CDATA + 2;
				break;

			// "]]" seen, a further ']' keeps us in this state
			case state_CDATA + 4:
				if (uc == '>')
				{
					token = XMLToken::CDSect;
					// strip "<![CDATA[" (9 characters) and "]]>" (3 characters)
					m_token = m_token.substr(9, m_token.length() - 12);
				}
				else if (uc == 0)
					not_well_formed("runaway cdata section");
				else if (uc != ']')
					state = state_CDATA + 2;
				break;

			// reference, either a character reference or a general entity reference
			case state_Reference:
				if (uc == '#')
					state = state_Reference + 2;
				else if (is_name_start_char(uc))
					state = state_Reference + 1;
				else
					not_well_formed("stray ampersand found in content");
				break;

			// general entity reference, collect the name up to the ';'
			case state_Reference + 1:
				if (not is_name_char(uc))
				{
					if (uc != ';')
						not_well_formed("invalid entity found in content, missing semicolon?");
					token = XMLToken::Reference;
					// strip the leading '&' and the trailing ';'
					m_token = m_token.substr(1, m_token.length() - 2);
				}
				break;

			// "&#" seen: hexadecimal when followed by 'x', decimal otherwise
			case state_Reference + 2:
				if (uc == 'x')
					state = state_Reference + 4;
				else if (uc >= '0' and uc <= '9')
				{
					charref = uc - '0';
					state += 1;
				}
				else
					not_well_formed("invalid character reference");
				break;

			// remaining decimal digits
			case state_Reference + 3:
				if (uc >= '0' and uc <= '9')
					charref = charref * 10 + (uc - '0');
				else if (uc == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character in content text");
					m_token.clear();
					append(m_token, charref);
					token = XMLToken::CharRef;
				}
				else
					not_well_formed("invalid character reference");
				break;

			// first hexadecimal digit, at least one is required
			case state_Reference + 4:
				if (uc >= 'a' and uc <= 'f')
				{
					charref = uc - 'a' + 10;
					state += 1;
				}
				else if (uc >= 'A' and uc <= 'F')
				{
					charref = uc - 'A' + 10;
					state += 1;
				}
				else if (uc >= '0' and uc <= '9')
				{
					charref = uc - '0';
					state += 1;
				}
				else
					not_well_formed("invalid character reference");
				break;

			// remaining hexadecimal digits
			case state_Reference + 5:
				if (uc >= 'a' and uc <= 'f')
					charref = (charref << 4) + (uc - 'a' + 10);
				else if (uc >= 'A' and uc <= 'F')
					charref = (charref << 4) + (uc - 'A' + 10);
				else if (uc >= '0' and uc <= '9')
					charref = (charref << 4) + (uc - '0');
				else if (uc == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character in content text");
					m_token.clear();
					append(m_token, charref);
					token = XMLToken::CharRef;
				}
				else
					not_well_formed("invalid character reference");
				break;

			// ]]> is illegal
			case state_Illegal:
				if (uc == ']')
					state += 1;
				else
				{
					retract();
					state = state_Content;
				}
				break;

			// "]]" seen; anything but '>' or another ']' is ordinary content,
			// so the last two characters are handed back to be rescanned
			case state_Illegal + 1:
				if (uc == '>')
					not_well_formed("the sequence ']]>' is illegal in content text");
				else if (uc != ']')
				{
					retract();
					retract();
					state = state_Content;
				}
				break;

			default:
				assert(false);
				not_well_formed("state reached that should not be reachable");
		}
	}

	return token;
}

// Convert the version number in m_token into a version_type.
//
// The text has to look like <digits>.<digits>. Anything else, and any
// version outside the range [1.0, 2.0), is a well-formedness error.
version_type parser_imp::parse_version()
{
	version_type result{ 0, 0 };

	bool seen_dot = false;
	bool has_major_digit = false;
	bool has_minor_digit = false;

	for (char ch : m_token)
	{
		if (ch >= '0' and ch <= '9')
		{
			if (seen_dot)
			{
				result.minor = result.minor * 10 + (ch - '0');
				has_minor_digit = true;
			}
			else
			{
				result.major = result.major * 10 + (ch - '0');
				has_major_digit = true;
			}
		}
		else if (ch == '.' and not seen_dot)
			seen_dot = true;
		else
			not_well_formed("Invalid XML version string");
	}

	// Both parts are mandatory: without this check "1" and "1." would be
	// taken for version 1.0.
	if (not(has_major_digit and has_minor_digit))
		not_well_formed("Invalid XML version string");

	if (result < version_type{ 1, 0 } or result >= version_type{ 2, 0 })
		not_well_formed("Invalid version specified: '" + m_token + "'");

	return result;
}

void parser_imp::parse(bool validate, bool validate_ns)
{
	m_validating = validate;
	m_validating_ns = validate_ns;

	m_lookahead = get_next_token();

	prolog();

	auto e = get_element(m_root_element);

	if (m_has_dtd and e == nullptr and m_validating)
		not_valid("Element '" + m_root_element + "' is not defined in DTD");

	if (e)
	{
		doctype::content_spec_element allowed(m_root_element);
		doctype::validator valid(allowed);

		element(valid);
	}
	else
	{
		doctype::content_spec_any allowed;
		doctype::validator valid(allowed);

		element(valid);
	}

	misc();

	if (m_lookahead != XMLToken::Eof)
		not_well_formed("garbage at end of file");

	if (not m_unresolved_ids.empty())
	{
		std::ostringstream os;
		os << "document contains references to the following undefined ID's: '";
		for (bool first = true; auto &id : m_unresolved_ids)
		{
			if (not std::exchange(first, false))
				os << ", ";
			os << id;
		}
		os << '\'';

		not_valid(os.str());
	}
}

// prolog: the optional XML declaration, misc items and an optional doctype
// declaration followed by more misc items. A missing doctype declaration is
// reported when validating.
void parser_imp::prolog()
{
	xml_decl();

	misc();

	if (m_lookahead != XMLToken::DocType)
	{
		if (m_validating)
			not_valid("document type declaration is missing");
		return;
	}

	doctypedecl();
	misc();
}

void parser_imp::xml_decl()
{
	if (m_lookahead == XMLToken::XMLDecl)
	{
		encoding_type encoding = m_encoding;

		if (m_encoding == encoding_type::UTF8)
			m_source.top()->encoding(encoding_type::ASCII);

		match(XMLToken::XMLDecl);

		s(true);
		if (m_token != "version")
			not_well_formed("expected a version attribute in XML declaration");
		match(XMLToken::Name);
		eq();

		auto version = parse_version();

		m_version = version;

		if (m_version >= version_type{ 2, 0 } or m_version < version_type{ 1, 0 })
			not_well_formed("This library only supports XML version 1.0 or 1.1");

		m_source.top()->version(version);
		match(XMLToken::String);

		if (m_lookahead == XMLToken::Space)
		{
			s(true);

			if (m_token == "encoding")
			{
				match(XMLToken::Name);
				eq();

				if (iequals(m_token, "us-ascii"))
					encoding = encoding_type::ASCII;
				else if (iequals(m_token, "iso-8859-1"))
					encoding = encoding_type::ISO88591;
				else if (iequals(m_token, "utf-8"))
					encoding = encoding_type::UTF8;
				else if (iequals(m_token, "utf-16"))
				{
					if (m_source.top()->encoding() != encoding_type::UTF16LE and m_source.top()->encoding() != encoding_type::UTF16BE)
						not_well_formed("Inconsistent encoding attribute in XML declaration");
					encoding = m_source.top()->encoding();
				}
				else
					not_well_formed("Unsupported encoding value '" + m_token + "'");
				match(XMLToken::String);

				s();
			}

			if (m_token == "standalone")
			{
				match(XMLToken::Name);
				eq();
				if (m_token != "yes" and m_token != "no")
					not_well_formed("Invalid XML declaration, standalone value should be either yes or no");
				m_standalone = (m_token == "yes");
				match(XMLToken::String);
				s();
			}
		}

		m_encoding = encoding;
		m_source.top()->encoding(encoding);

		match(XMLToken::QuestionMark);
		match(XMLToken::GreaterThan);

		m_parser.xml_decl(m_encoding, m_standalone, m_version);
	}
	else
		m_encoding = m_source.top()->encoding();
}

void parser_imp::text_decl()
{
	if (m_lookahead == XMLToken::XMLDecl)
	{
		encoding_type encoding = m_source.top()->encoding();
		if (encoding == encoding_type::UTF8 and not m_source.top()->has_bom())
			m_source.top()->encoding(encoding_type::ISO88591);

		match(XMLToken::XMLDecl);

		s(true);

		if (m_token == "version")
		{
			match(XMLToken::Name);
			eq();

			auto version = parse_version();
			if (version > m_version)
				not_well_formed("Version mismatch between document and external entity");

			match(XMLToken::String);
			s(m_version == version_type{ 1, 0 });
		}

		if (m_token != "encoding")
		{
			if (m_version == version_type{ 1, 0 })
				not_well_formed("encoding attribute is mandatory in text declaration");
		}
		else
		{
			match(XMLToken::Name);
			eq();
			match(XMLToken::String);
			s();
		}

		m_source.top()->encoding(encoding);

		match(XMLToken::QuestionMark);
		match(XMLToken::GreaterThan);
	}
}

// Consume any run of whitespace, comments and processing instructions.
void parser_imp::misc()
{
	while (true)
	{
		switch (m_lookahead)
		{
			case XMLToken::Space:
				s();
				break;

			case XMLToken::Comment:
				comment();
				break;

			case XMLToken::PI:
				pi();
				break;

			default:
				// anything else is not for us
				return;
		}
	}
}

void parser_imp::doctypedecl()
{
	match(XMLToken::DocType);

	m_has_dtd = true;

	s(true);

	auto name = m_token;
	match(XMLToken::Name);

	m_root_element = name;

	std::unique_ptr<data_source> dtd;

	if (m_lookahead == XMLToken::Space)
	{
		s(true);

		if (m_lookahead == XMLToken::Name)
		{
			std::string pubid, uri;

			if (m_token == "SYSTEM")
			{
				match(XMLToken::Name);
				s(true);

				uri = m_token;

				if (not is_valid_system_literal(uri))
					not_well_formed("invalid system literal");

				if (m_root_element == "html" and uri == "about:legacy-compat")
					m_is_html5 = true;
			}
			else if (m_token == "PUBLIC")
			{
				match(XMLToken::Name);
				s(true);

				pubid = m_token;
				match(XMLToken::String);

				// validate the public ID
				if (not is_valid_public_id(pubid))
					not_well_formed("Invalid public ID");

				s(true);
				uri = m_token;
			}
			else
				not_well_formed("Expected external id starting with either SYSTEM or PUBLIC");

			match(XMLToken::String);
			dtd.reset(get_data_source(pubid, uri));

			if (m_validating and not dtd)
				not_valid("Could not load DTD " + uri);

			m_parser.doctype_decl(m_root_element, pubid, uri);
		}

		s();
	}

	if (m_lookahead == XMLToken::OpenBracket)
	{
		match(XMLToken::OpenBracket);
		intsubset();
		match(XMLToken::CloseBracket);

		s();
	}

	// internal subset takes precedence over external subset, so
	// if the external subset is defined, include it here.
	if (dtd.get() != nullptr)
	{
		push_data_source(dtd.release(), false);

		m_external_subset = true;
		m_in_external_dtd = true;

		m_lookahead = get_next_token();

		text_decl();

		extsubset();

		match(XMLToken::Eof);

		pop_data_source();
		m_in_external_dtd = false;
	}

	match(XMLToken::GreaterThan);

	// test if all ndata references can be resolved

	for (const auto &e : m_general_entities)
	{
		if (e->is_parsed() == false and m_notations.count(e->get_ndata()) == 0)
			not_valid("Undefined NOTATION '" + e->get_ndata() + "'");
	}

	// and the notations in the doctype attlists
	for (const auto &element : m_doctype)
	{
		for (const auto &attr : element->get_attributes())
		{
			if (attr->get_type() != doctype::attribute_type::Notation)
				continue;

			for (auto &n : attr->get_enums())
			{
				if (m_notations.count(n) == 0)
					not_valid("Undefined NOTATION '" + n + "'");
			}
		}
	}
}

// Expand the parameter entity reference that is the current token: a data
// source for the replacement text of the entity is pushed before the
// reference itself is matched, so the next token is read from that source.
void parser_imp::pereference()
{
	const doctype::entity &entity = get_parameter_entity(m_token);

	auto *replacement = new parameter_entity_data_source(entity.get_replacement(), entity.get_path());
	push_data_source(replacement, true);

	match(XMLToken::PEReference);
}

// Parse the internal DTD subset: markup declarations, processing
// instructions, comments and declaration separators, until some other token
// is found. m_internal_subset and m_allow_peref are set for the duration of
// the call.
void parser_imp::intsubset()
{
	save_state state_intsubset(m_internal_subset, true);
	save_state state_allow_peref(m_allow_peref, true);

	while (true)
	{
		switch (m_lookahead)
		{
			case XMLToken::Element:
			case XMLToken::AttList:
			case XMLToken::Entity:
			case XMLToken::Notation:
				markup_decl();
				break;

			case XMLToken::PI:
				pi();
				break;

			case XMLToken::Comment:
				comment();
				break;

			case XMLToken::Space:
			case XMLToken::PEReference:
				declsep();
				break;

			default:
				// not part of the subset
				return;
		}
	}
}

// Declaration separator: either whitespace or a parameter entity reference.
// The replacement text of a referenced entity is parsed as a sequence of
// complete declarations via extsubset() and has to be consumed completely.
void parser_imp::declsep()
{
	save_state state_declsep(m_in_declsep, true);

	if (m_lookahead == XMLToken::Space)
	{
		s();
		return;
	}

	if (m_lookahead != XMLToken::PEReference)
		return;

	const doctype::entity &entity = get_parameter_entity(m_token);

	match(XMLToken::PEReference);

	push_data_source(new parameter_entity_data_source(entity.get_replacement(), entity.get_path()), false);

	m_lookahead = get_next_token();
	extsubset();

	// the replacement text may not end halfway a declaration
	match(XMLToken::Eof);
	pop_data_source();
}

// Parse declarations of an external subset: the same items as in the
// internal subset plus conditional sections. m_external_subset and
// m_allow_peref are set for the duration of the call.
void parser_imp::extsubset()
{
	save_state state_extsubset(m_external_subset, true);
	save_state state_allow_peref(m_allow_peref, true);

	while (true)
	{
		switch (m_lookahead)
		{
			case XMLToken::Element:
			case XMLToken::AttList:
			case XMLToken::Entity:
			case XMLToken::Notation:
				markup_decl();
				break;

			case XMLToken::IncludeIgnore:
				conditionalsect();
				break;

			case XMLToken::PI:
				pi();
				break;

			case XMLToken::Comment:
				comment();
				break;

			case XMLToken::Space:
			case XMLToken::PEReference:
				declsep();
				break;

			default:
				// not part of the subset
				return;
		}
	}
}

// Parse a conditional section, "<![" has been scanned already.
//
// For INCLUDE the contents are parsed as external subset declarations up to
// the closing "]]>". For IGNORE the contents are skipped with
// ignoresectcontents(). The valid_nesting_validator is consulted after the
// keyword and before the end of the section.
void parser_imp::conditionalsect()
{
	valid_nesting_validator check(*m_source.top());
	match(XMLToken::IncludeIgnore);

	s();

	bool include = false;

	if (m_token == "INCLUDE")
		include = true;
	else if (m_token == "IGNORE")
		include = false;
	else if (m_lookahead == XMLToken::Name)
		not_well_formed("Unexpected literal '" + m_token + "'");

	match(XMLToken::Name);

	check.check(*m_source.top());

	s();

	if (include)
	{
		match(XMLToken::OpenBracket);
		extsubset();
		match(XMLToken::CloseBracket);
		match(XMLToken::CloseBracket);
		check.check(*m_source.top());
		match(XMLToken::GreaterThan);
	}
	else
	{
		// The '[' has to be the current lookahead. It cannot be consumed with
		// match() since that would tokenize the start of the ignored text,
		// so it is checked by hand. Without this check anything between the
		// keyword and the closing "]]>" would be accepted.
		if (m_lookahead != XMLToken::OpenBracket)
			not_well_formed("Expected '[' after IGNORE in conditional section");

		ignoresectcontents();
		check.check(*m_source.top());
		m_lookahead = get_next_token();
	}
}

// Skip the contents of an IGNORE section up to and including the "]]>" that
// closes it. Nested "<![" ... "]]>" pairs are skipped along with the rest.
// Reaching the end of the input first is a well-formedness error.
//
// Nesting is tracked with a counter instead of recursion, so deeply nested
// input cannot exhaust the call stack.
void parser_imp::ignoresectcontents()
{
	// states: 0 = plain text, 1 = "]" seen, 2 = "]]" seen,
	//         10 = "<" seen, 11 = "<!" seen

	std::size_t depth = 1; // number of sections still open, including ours
	int state = 0;

	while (depth > 0)
	{
		char32_t ch = get_next_char();
		if (ch == 0)
			not_well_formed("runaway IGNORE section");

		switch (state)
		{
			case 0:
				if (ch == ']')
					state = 1;
				else if (ch == '<')
					state = 10;
				break;

			case 1:
				if (ch == ']')
					state = 2;
				else
				{
					// look at this character again, it might be a '<'
					retract();
					state = 0;
				}
				break;

			case 2:
				if (ch == '>')
				{
					// end of the innermost open section
					--depth;
					state = 0;
				}
				else if (ch != ']')
				{
					retract();
					state = 0;
				}
				break;

			case 10:
				if (ch == '!')
					state = 11;
				else
				{
					retract();
					state = 0;
				}
				break;

			case 11:
				if (ch == '[')
					++depth; // a nested section starts here
				else
					retract();
				state = 0;
				break;

			default:
				break;
		}
	}
}

// Dispatch on the lookahead to the routine for that kind of markup
// declaration. Parameter entity references are allowed during the call
// exactly when m_external_subset is set.
void parser_imp::markup_decl()
{
	save_state state_allow_peref(m_allow_peref, m_external_subset);

	if (m_lookahead == XMLToken::Element)
		element_decl();
	else if (m_lookahead == XMLToken::AttList)
		attlist_decl();
	else if (m_lookahead == XMLToken::Entity)
		entity_decl();
	else if (m_lookahead == XMLToken::Notation)
		notation_decl();
	else if (m_lookahead == XMLToken::PI)
		pi();
	else if (m_lookahead == XMLToken::Comment)
		comment();
	else if (m_lookahead == XMLToken::Space)
		s();
}

// Parse an <!ELEMENT ...> declaration. The element is added to m_doctype
// unless an entry with that name is there already; an entry that is already
// declared is reported as a duplicate. The content specification is stored
// in the entry.
void parser_imp::element_decl()
{
	valid_nesting_validator check(*m_source.top());

	match(XMLToken::Element);
	s(true);

	std::string name = m_token;
	if (name.starts_with("xmlns:"))
		not_well_formed("Element names should not start with xmlns:");

	const auto has_this_name = [&name](const auto &decl)
	{ return decl->name() == name; };

	auto pos = std::ranges::find_if(m_doctype, has_this_name);

	if (pos == m_doctype.end())
		pos = m_doctype.insert(m_doctype.end(), std::make_shared<doctype::element>(name, true, m_in_external_dtd));
	else if ((*pos)->is_declared())
		not_valid("duplicate element declaration for element '" + name + "'");

	match(XMLToken::Name);
	s(true);

	contentspec(**pos);
	s();

	check.check(*m_source.top());
	match(XMLToken::GreaterThan);
}

// Parse the content specification of an element declaration and store it in
// `element` with set_allowed().
//
// The specification is either one of the keywords EMPTY and ANY, mixed
// content starting with #PCDATA, or a content model of child elements. A
// content model may be followed by '*', '+' or '?', whether or not the
// group contains a ',' or '|'. Mixed content only takes '*', which is
// mandatory as soon as element names are listed.
void parser_imp::contentspec(doctype::element &element)
{
	if (m_lookahead == XMLToken::Name)
	{
		if (m_token == "EMPTY")
			element.set_allowed(std::make_shared<doctype::content_spec_empty>());
		else if (m_token == "ANY")
			element.set_allowed(std::make_shared<doctype::content_spec_any>());
		else
			not_well_formed("Invalid element content specification");
		match(XMLToken::Name);
	}
	else
	{
		valid_nesting_validator check(*m_source.top());

		match(XMLToken::OpenParenthesis);

		doctype::content_spec_base_ptr allowed;

		s();

		bool mixed = false;
		bool more = false; // true when the group has more than one alternative or item

		if (m_lookahead == XMLToken::PCData) // Mixed
		{
			mixed = true;
			match(m_lookahead);

			s();

			std::set<std::string> seen;

			while (m_lookahead == XMLToken::Pipe)
			{
				more = true;

				match(XMLToken::Pipe);
				s();

				if (seen.count(m_token) > 0)
					not_valid("no duplicates allowed in mixed content for element declaration");
				seen.insert(m_token);

				match(XMLToken::Name);
				s();
			}

			auto choice = std::make_shared<doctype::content_spec_choice>(true);
			for (auto &c : seen)
				choice->add(std::make_shared<doctype::content_spec_element>(c));
			allowed = choice;
		}
		else // children
		{
			allowed = cp();

			s();

			if (m_lookahead == XMLToken::Comma)
			{
				auto seq = std::make_shared<doctype::content_spec_seq>(allowed);
				allowed = seq;

				more = true;
				do
				{
					match(m_lookahead);
					s();
					seq->add(cp());
					s();
				} while (m_lookahead == XMLToken::Comma);
			}
			else if (m_lookahead == XMLToken::Pipe)
			{
				auto choice = std::make_shared<doctype::content_spec_choice>(allowed, false);
				allowed = choice;

				more = true;
				do
				{
					match(m_lookahead);
					s();
					choice->add(cp());
					s();
				} while (m_lookahead == XMLToken::Pipe);
			}
		}

		s();

		check.check(*m_source.top());

		match(XMLToken::CloseParenthesis);

		if (m_lookahead == XMLToken::Asterisk)
		{
			allowed = std::make_shared<doctype::content_spec_repeated>(allowed, '*');
			match(XMLToken::Asterisk);
		}
		else if (mixed)
		{
			// Mixed content that lists element names has to end in ")*".
			// The lookahead is not an asterisk here, so this match reports
			// the error.
			if (more)
			{
				allowed = std::make_shared<doctype::content_spec_repeated>(allowed, '*');
				match(XMLToken::Asterisk);
			}
		}
		else if (m_lookahead == XMLToken::Plus)
		{
			// also valid for a group with a single item, as in (a)+
			allowed = std::make_shared<doctype::content_spec_repeated>(allowed, '+');
			match(XMLToken::Plus);
		}
		else if (m_lookahead == XMLToken::QuestionMark)
		{
			allowed = std::make_shared<doctype::content_spec_repeated>(allowed, '?');
			match(XMLToken::QuestionMark);
		}

		element.set_allowed(allowed);
	}
}

// Parse one content particle: an element name or a parenthesised sequence
// or choice of particles, optionally followed by '*', '+' or '?'.
doctype::content_spec_base_ptr parser_imp::cp()
{
	doctype::content_spec_base_ptr particle;

	if (m_lookahead != XMLToken::OpenParenthesis)
	{
		// a plain element name
		std::string name = m_token;
		match(XMLToken::Name);

		particle = std::make_shared<doctype::content_spec_element>(name);
	}
	else
	{
		valid_nesting_validator check(*m_source.top());

		match(XMLToken::OpenParenthesis);

		s();
		particle = cp();
		s();

		// what follows the first particle decides between sequence and choice
		if (m_lookahead == XMLToken::Comma)
		{
			auto seq = std::make_shared<doctype::content_spec_seq>(particle);
			particle = seq;

			while (m_lookahead == XMLToken::Comma)
			{
				match(m_lookahead);
				s();
				seq->add(cp());
				s();
			}
		}
		else if (m_lookahead == XMLToken::Pipe)
		{
			auto choice = std::make_shared<doctype::content_spec_choice>(particle, false);
			particle = choice;

			while (m_lookahead == XMLToken::Pipe)
			{
				match(m_lookahead);
				s();
				choice->add(cp());
				s();
			}
		}

		s();
		check.check(*m_source.top());
		match(XMLToken::CloseParenthesis);
	}

	// optional repetition suffix
	if (m_lookahead == XMLToken::Asterisk)
	{
		particle = std::make_shared<doctype::content_spec_repeated>(particle, '*');
		match(XMLToken::Asterisk);
	}
	else if (m_lookahead == XMLToken::Plus)
	{
		particle = std::make_shared<doctype::content_spec_repeated>(particle, '+');
		match(XMLToken::Plus);
	}
	else if (m_lookahead == XMLToken::QuestionMark)
	{
		particle = std::make_shared<doctype::content_spec_repeated>(particle, '?');
		match(XMLToken::QuestionMark);
	}

	return particle;
}

// Parse an <!ENTITY ...> declaration. A '%' after the keyword marks the
// declaration of a parameter entity, anything else a general entity.
void parser_imp::entity_decl()
{
	match(XMLToken::Entity);
	s(true);

	if (m_lookahead != XMLToken::Percent)
	{
		general_entity_decl();
		return;
	}

	parameter_entity_decl();
}

// Parse the declaration of a parameter entity, starting at the '%'. The
// entity is defined by a literal value or by an external ID. It is only
// stored when no parameter entity with the same name exists yet.
void parser_imp::parameter_entity_decl()
{
	match(XMLToken::Percent);
	s(true);

	std::string name = m_token;
	match(XMLToken::Name);

	if (m_validating_ns and name.find(':') != std::string::npos)
		not_well_formed("Entity names should not contain a colon");
	if (name.starts_with("xmlns:"))
		not_well_formed("Entity names should not start with xmlns:");

	s(true);

	std::string path;
	std::string value;

	if (m_lookahead != XMLToken::String)
	{
		// an external ID
		std::tie(path, value) = read_external_id();
		match(XMLToken::String);
	}
	else
	{
		// a literal entity value
		value = m_token;
		match(XMLToken::String);

		parse_parameter_entity_declaration(value);
	}

	s();

	match(XMLToken::GreaterThan);

	const auto existing = std::ranges::find_if(m_parameter_entities,
		[&name](const auto &entity)
		{ return entity->name() == name; });

	if (existing != m_parameter_entities.end())
		return;

	m_parameter_entities.push_back(std::make_shared<doctype::parameter_entity>(name, value, path));
}

// Parse the declaration of a general entity, starting at its name. The
// entity is defined by a literal value or by an external ID, the latter
// optionally followed by NDATA and a notation name, which makes it an
// unparsed entity. It is only stored when no general entity with the same
// name exists yet.
void parser_imp::general_entity_decl()
{
	std::string name = m_token;
	match(XMLToken::Name);
	s(true);

	if (m_validating_ns and name.find(':') != std::string::npos)
		not_well_formed("Entity names should not contain a colon");
	if (name.starts_with("xmlns:"))
		not_well_formed("Entity names should not start with xmlns:");

	std::string value;
	std::string ndata;
	bool external = false;
	bool parsed = true;

	if (m_lookahead != XMLToken::String)
	{
		// an external ID
		std::tie(std::ignore, value) = read_external_id();
		match(XMLToken::String);
		external = true;

		if (m_lookahead == XMLToken::Space)
		{
			s(true);

			if (m_lookahead == XMLToken::Name and m_token == "NDATA")
			{
				match(XMLToken::Name);
				s(true);

				parsed = false;
				ndata = m_token;

				match(XMLToken::Name);
			}
		}
	}
	else
	{
		// a literal entity value
		value = m_token;
		match(XMLToken::String);

		parse_general_entity_declaration(value);
	}

	s();

	match(XMLToken::GreaterThan);

	const auto existing = std::ranges::find_if(m_general_entities,
		[&name](const auto &entity)
		{ return entity->name() == name; });

	if (existing != m_general_entities.end())
		return;

	m_general_entities.push_back(std::make_shared<doctype::general_entity>(name, value, external, parsed));

	auto &added = m_general_entities.back();

	if (not parsed)
		added->set_ndata(ndata);

	if (m_in_external_dtd)
		added->set_externally_defined(true);
}

// Parse an attribute-list declaration:
//
//   '<!ATTLIST' S Name (S Name S AttType S DefaultDecl)* S? '>'
//
// Each attribute definition is added to the entry in m_doctype that has the
// given element name; when there is no such entry yet a new one is appended
// to m_doctype first.
void parser_imp::attlist_decl()
{
	using attribute_type = doctype::attribute_type;

	// attribute types that consist of a single keyword
	static constexpr std::pair<std::string_view, attribute_type> kKeywordTypes[] = {
		{ "CDATA", attribute_type::CDATA },
		{ "ID", attribute_type::ID },
		{ "IDREF", attribute_type::IDREF },
		{ "IDREFS", attribute_type::IDREFS },
		{ "ENTITY", attribute_type::ENTITY },
		{ "ENTITIES", attribute_type::ENTITIES },
		{ "NMTOKEN", attribute_type::NMTOKEN },
		{ "NMTOKENS", attribute_type::NMTOKENS }
	};

	match(XMLToken::AttList);
	s(true);
	std::string element = m_token;
	match(XMLToken::Name);

	auto dte = std::ranges::find_if(m_doctype,
		[&element](const auto &e)
		{ return e->name() == element; });

	if (dte == m_doctype.end())
		dte = m_doctype.insert(m_doctype.end(), std::make_shared<doctype::element>(element, false, m_in_external_dtd));

	// Read the remainder of a '|' separated token group, up to and including
	// the closing parenthesis. The opening parenthesis has been matched by the
	// caller. Tokens must be names, or names and nmtokens if allow_nmtoken.
	auto read_token_group = [this](bool allow_nmtoken)
	{
		std::vector<std::string> tokens;

		for (;;)
		{
			s();

			if (std::ranges::find(tokens, m_token) != tokens.end())
				not_valid("Duplicate token in enumerated attribute declaration ('" + m_token + "')");

			tokens.push_back(m_token);
			if (allow_nmtoken and m_lookahead != XMLToken::Name)
				match(XMLToken::NMToken);
			else
				match(XMLToken::Name);

			s();

			if (m_lookahead != XMLToken::Pipe)
				break;

			match(XMLToken::Pipe);
		}

		s();

		match(XMLToken::CloseParenthesis);

		return tokens;
	};

	// one iteration per attribute definition

	while (m_lookahead == XMLToken::Space)
	{
		s(true);

		if (m_lookahead != XMLToken::Name)
			break;

		std::string name = m_token;
		match(XMLToken::Name);
		s(true);

		doctype::attribute_ptr attribute;

		// --- AttType ---

		if (m_lookahead == XMLToken::OpenParenthesis) // enumeration
		{
			match(m_lookahead);

			std::vector<std::string> enums = read_token_group(true);

			attribute = std::make_shared<doctype::attribute>(name, attribute_type::Enumerated, enums);
		}
		else
		{
			std::string type = m_token;
			match(XMLToken::Name);

			const auto keyword = std::ranges::find_if(kKeywordTypes,
				[&type](const auto &k)
				{ return k.first == type; });

			if (keyword != std::end(kKeywordTypes))
				attribute = std::make_shared<doctype::attribute>(name, keyword->second);
			else if (type == "NOTATION")
			{
				s(true);
				match(XMLToken::OpenParenthesis);

				std::vector<std::string> notations = read_token_group(false);

				attribute = std::make_shared<doctype::attribute>(name, attribute_type::Notation, notations);
			}
			else
				not_well_formed("invalid attribute type");
		}

		// --- DefaultDecl ---

		auto reject_default_for_id = [&]()
		{
			if (attribute->get_type() == attribute_type::ID)
				not_valid("the default declaration for an ID attribute declaration should be #IMPLIED or #REQUIRED");
		};

		auto check_default_value = [&](std::string &v)
		{
			// an empty default value is not validated
			if (v.empty())
				return;

			if (not attribute->validate_value(v, m_general_entities))
				not_valid(std::format("default value '{}' for attribute '{}' is not valid", v, name));
		};

		s(true);

		switch (m_lookahead)
		{
			case XMLToken::Required:
				match(m_lookahead);
				attribute->set_default(doctype::attribute_default::Required, "");
				break;

			case XMLToken::Implied:
				match(m_lookahead);
				attribute->set_default(doctype::attribute_default::Implied, "");
				break;

			case XMLToken::Fixed:
			{
				match(m_lookahead);
				reject_default_for_id();

				s(true);

				// NOTE(review): the return value of normalize_attribute_value is
				// ignored here, while element() uses it -- confirm that this
				// overload also normalizes its argument in place.
				std::string fixed_value = m_token;
				normalize_attribute_value(fixed_value, attribute->get_type() == attribute_type::CDATA);
				check_default_value(fixed_value);

				attribute->set_default(doctype::attribute_default::Fixed, fixed_value);
				match(XMLToken::String);
				break;
			}

			default:
			{
				// anything else has to be a plain default value
				reject_default_for_id();

				if (m_standalone)
					not_valid("Document cannot be standalone since there is a default value for an attribute");

				// NOTE(review): see the remark in the #FIXED case about the
				// ignored return value.
				std::string default_value = m_token;
				normalize_attribute_value(default_value, attribute->get_type() == attribute_type::CDATA);
				collapse_spaces(default_value);
				check_default_value(default_value);

				attribute->set_default(doctype::attribute_default::None, default_value);
				match(XMLToken::String);
				break;
			}
		}

		// the element may not already have an attribute of type ID
		if (attribute->get_type() == attribute_type::ID)
		{
			const doctype::attribute_list &atts = (*dte)->get_attributes();

			const bool has_id = std::ranges::any_of(atts,
				[](const auto &a)
				{ return a->get_type() == attribute_type::ID; });

			if (has_id)
				not_valid("only one attribute per element can have the ID type");
		}

		attribute->set_external(m_in_external_dtd);
		// attribute->version(m_version);
		(*dte)->add_attribute(attribute);
	}

	match(XMLToken::GreaterThan);
}

void parser_imp::notation_decl()
{
	match(XMLToken::Notation);
	s(true);

	std::string name = m_token, pubid, sysid;

	if (m_validating_ns and name.find(':') != std::string::npos)
		not_well_formed("Notation names should not contain a colon");

	if (m_notations.count(name) > 0)
		not_valid("notation names should be unique");
	m_notations.insert(name);

	match(XMLToken::Name);
	s(true);

	if (m_token == "SYSTEM")
	{
		match(XMLToken::Name);
		s(true);

		sysid = m_token;
		match(XMLToken::String);

		if (not is_valid_system_literal(sysid))
			not_well_formed("invalid system literal");
	}
	else if (m_token == "PUBLIC")
	{
		match(XMLToken::Name);
		s(true);

		pubid = m_token;
		match(XMLToken::String);

		// validate the public ID
		if (not is_valid_public_id(pubid))
			not_well_formed("Invalid public ID");

		s();

		if (m_lookahead == XMLToken::String)
		{
			sysid = m_token;
			match(XMLToken::String);
		}
	}
	else
		not_well_formed("Expected either SYSTEM or PUBLIC");

	s();

	match(XMLToken::GreaterThan);

	collapse_spaces(sysid);

	for (char &ch : pubid)
	{
		if (ch == '\t' or ch == '\n')
			ch = ' ';
	}

	collapse_spaces(pubid);

	m_parser.notation_decl(name, sysid, pubid);
}

// Create a data source for the external entity identified by pubid and uri.
//
// The stream is requested from m_parser.external_entity_ref, passing the base
// of the current top of m_source. When no stream is returned the result is
// nullptr. Otherwise a newly allocated istream_data_source is returned; the
// caller takes ownership of it.
//
// The base of the new data source is derived from uri:
//  - no '/' in uri:            the base of the current source
//  - absolute directory part:  that directory
//  - relative directory part:  the base of the current source + '/' + directory
data_source *parser_imp::get_data_source(std::string_view pubid, std::string uri)
{
	auto is = m_parser.external_entity_ref(m_source.top()->base(), pubid, uri);
	if (is == nullptr)
		return nullptr;

	// Owned by a unique_ptr until returned, so the data source is not leaked
	// if one of the string operations below throws.
	std::unique_ptr<data_source> result(new istream_data_source(is.release()));

	const std::string::size_type slash = uri.rfind('/');
	if (slash == std::string::npos)
		result->base(m_source.top()->base());
	else
	{
		// strip the file name, keep the directory part
		uri.erase(slash, std::string::npos);

		if (is_absolute_path(uri))
			result->base(std::move(uri));
		else
			result->base(m_source.top()->base() + '/' + uri);
	}

	return result.release();
}

std::tuple<std::string, std::string> parser_imp::read_external_id()
{
	std::string result;
	std::string path;

	std::string pubid, uri;

	if (m_token == "SYSTEM")
	{
		match(XMLToken::Name);
		s(true);

		uri = m_token;

		if (not is_valid_system_literal(uri))
			not_well_formed("invalid system literal");
	}
	else if (m_token == "PUBLIC")
	{
		match(XMLToken::Name);
		s(true);

		pubid = m_token;
		match(XMLToken::String);

		// validate the public ID
		if (not is_valid_public_id(pubid))
			not_well_formed("Invalid public ID");

		s(true);
		uri = m_token;
	}
	else
		not_well_formed("Expected external id starting with either SYSTEM or PUBLIC");

	std::unique_ptr<data_source> data(get_data_source(pubid, uri));

	if (data)
	{
		push_data_source(data.release(), false);

		path = m_source.top()->base();

		m_lookahead = get_next_token();

		text_decl();

		if (m_lookahead != XMLToken::Eof)
		{
			result = m_token;

			while (m_buffer_ptr > m_buffer.begin())
				append(result, *--m_buffer_ptr);

			while (char32_t ch = m_source.top()->get_next_char())
				append(result, ch);
		}

		pop_data_source();
	}

	return std::make_tuple(path, result);
}

void parser_imp::parse_parameter_entity_declaration(std::string &s)
{
	std::string result;

	int state = 0;
	char32_t charref = 0;
	std::string name;
	int open = 0;

	for (char32_t c : s)
	{
		switch (state)
		{
			case 0:
				if (c == '&')
					state = 1;
				else if (c == '%')
				{
					if (m_allow_peref)
					{
						name.clear();
						state = 20;
					}
					else
						not_well_formed("parameter entities may not occur in declarations that are not in an external subset");
				}
				else if (c == '<')
				{
					++open;
					append(result, c);
				}
				else if (c == '>')
				{
					--open;
					append(result, c);
				}
				else if (not is_char(c))
					not_well_formed("Invalid character in entity value");
				else
					append(result, c);
				break;

			case 1:
				if (c == '#')
					state = 2;
				else
				{
					result += '&';
					append(result, c);
					state = 0;
				}
				break;

			case 2:
				if (c == 'x')
					state = 4;
				else if (c >= '0' and c <= '9')
				{
					charref = c - '0';
					state = 3;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 3:
				if (c >= '0' and c <= '9')
					charref = charref * 10 + (c - '0');
				else if (c == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character referenced: " + to_hex(charref) + '\'');

					append(result, charref);
					state = 0;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 4:
				if (c >= 'a' and c <= 'f')
				{
					charref = c - 'a' + 10;
					state = 5;
				}
				else if (c >= 'A' and c <= 'F')
				{
					charref = c - 'A' + 10;
					state = 5;
				}
				else if (c >= '0' and c <= '9')
				{
					charref = c - '0';
					state = 5;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 5:
				if (c >= 'a' and c <= 'f')
					charref = (charref << 4) + (c - 'a' + 10);
				else if (c >= 'A' and c <= 'F')
					charref = (charref << 4) + (c - 'A' + 10);
				else if (c >= '0' and c <= '9')
					charref = (charref << 4) + (c - '0');
				else if (c == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character referenced: '" + to_hex(charref) + '\'');

					append(result, charref);
					state = 0;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 20:
				if (c == ';')
				{
					const doctype::entity &e = get_parameter_entity(name);
					result += e.get_replacement();
					state = 0;
				}
				else if (is_name_char(c))
					append(name, c);
				else
					not_well_formed("invalid parameter entity reference");
				break;

			default:
				assert(false);
				not_well_formed("invalid state");
		}
	}

	if (state != 0)
		not_well_formed("invalid reference");

	if (open != 0)
		not_valid("invalid reference");

	swap(s, result);
}

// Process the literal value of a general entity declaration in place, before
// it is stored.
//
// Character references (&#NNN; and &#xHHH;) are replaced by the character
// they denote and parameter entity references (%name;) by the replacement
// text of that entity. General entity references (&name;) are checked for
// their syntax and copied unexpanded.
//
// Reported as not well-formed: invalid characters, an '&' that does not start
// a reference, malformed or illegal character references, malformed entity
// references, parameter entity references when m_allow_peref is not set, and
// a value ending in the middle of a reference.
void parser_imp::parse_general_entity_declaration(std::string &s)
{
	std::string result;

	// 0: plain text, 1: after '&', 2: after "&#", 3: decimal digits,
	// 4: after "&#x", 5: hexadecimal digits, 10: general entity name,
	// 20: parameter entity name
	int state = 0;
	char32_t charref = 0;
	std::string name;

	auto sp = s.cbegin();
	auto se = s.cend();

	while (sp < se)
	{
		char32_t c = pop_front_char(sp, se);

		switch (state)
		{
			case 0:
				if (c == '&')
					state = 1;
				else if (c == '%')
				{
					if (m_allow_peref)
					{
						name.clear();
						state = 20;
					}
					else
						not_well_formed("parameter entities may not occur in declarations that are not in an external subset");
				}
				else if (not is_char(c))
					not_well_formed("Invalid character in entity value");
				else
					append(result, c);
				break;

			case 1:
				if (c == '#')
					state = 2;
				else if (is_name_start_char(c))
				{
					name.clear();
					append(name, c);
					state = 10;
				}
				else
				{
					// An '&' must start a reference. Without this branch both
					// the '&' and this character would be dropped silently.
					not_well_formed("invalid entity reference");
				}
				break;

			case 2:
				if (c == 'x')
					state = 4;
				else if (c >= '0' and c <= '9')
				{
					charref = c - '0';
					state = 3;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 3:
				if (c >= '0' and c <= '9')
					charref = charref * 10 + (c - '0');
				else if (c == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character referenced: '" + to_hex(charref) + '\'');

					append(result, charref);
					state = 0;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 4:
				if (c >= 'a' and c <= 'f')
				{
					charref = c - 'a' + 10;
					state = 5;
				}
				else if (c >= 'A' and c <= 'F')
				{
					charref = c - 'A' + 10;
					state = 5;
				}
				else if (c >= '0' and c <= '9')
				{
					charref = c - '0';
					state = 5;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 5:
				if (c >= 'a' and c <= 'f')
					charref = (charref << 4) + (c - 'a' + 10);
				else if (c >= 'A' and c <= 'F')
					charref = (charref << 4) + (c - 'A' + 10);
				else if (c >= '0' and c <= '9')
					charref = (charref << 4) + (c - '0');
				else if (c == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character referenced: '" + to_hex(charref) + '\'');

					append(result, charref);
					state = 0;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case 10:
				if (c == ';')
				{
					// copy the general entity reference unexpanded
					result += '&';
					result += name;
					result += ';';

					state = 0;
				}
				else if (is_name_char(c))
					append(name, c);
				else
					not_well_formed("invalid entity reference");
				break;

			case 20:
				if (c == ';')
				{
					const doctype::entity &e = get_parameter_entity(name);
					result += e.get_replacement();
					state = 0;
				}
				else if (is_name_char(c))
					append(name, c);
				else
					not_well_formed("invalid parameter entity reference");
				break;

			default:
				assert(false);
				not_well_formed("invalid state");
		}
	}

	// the value ended in the middle of a reference
	if (state != 0)
		not_well_formed("invalid reference");

	swap(s, result);
}

std::string parser_imp::normalize_attribute_value()
{
	std::string result;

	char32_t charref = 0;
	std::string name;

	enum State
	{
		state_Start,
		state_ReferenceStart,
		state_CharReferenceStart,
		state_HexCharReference,
		state_HexCharReference2,
		state_DecCharReference,
		state_EntityReference,

	} state = state_Start;

	for (;;)
	{
		char32_t c = get_next_char();

		if (c == 0)
			break;

		if (c == '<')
			not_well_formed("Attribute values may not contain '<' character");

		switch (state)
		{
			case state_Start:
				if (c == ' ' or c == '\t' or c == '\r' or c == '\n')
					result += ' ';
				else if (c == '&')
					state = state_ReferenceStart;
				else
					append(result, c);
				break;

			case state_ReferenceStart:
				if (c == '#')
					state = state_CharReferenceStart;
				else if (is_name_start_char(c))
				{
					name.clear();
					append(name, c);
					state = state_EntityReference;
				}
				else
					not_well_formed("invalid reference found in attribute value");
				break;

			case state_CharReferenceStart:
				if (c == 'x')
					state = state_HexCharReference;
				else if (c >= '0' and c <= '9')
				{
					charref = c - '0';
					state = state_DecCharReference;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case state_DecCharReference:
				if (c >= '0' and c <= '9')
					charref = charref * 10 + (c - '0');
				else if (c == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character referenced: '" + to_hex(charref) + '\'');

					append(result, charref);
					state = state_Start;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case state_HexCharReference:
				if (c >= 'a' and c <= 'f')
				{
					charref = c - 'a' + 10;
					state = state_HexCharReference2;
				}
				else if (c >= 'A' and c <= 'F')
				{
					charref = c - 'A' + 10;
					state = state_HexCharReference2;
				}
				else if (c >= '0' and c <= '9')
				{
					charref = c - '0';
					state = state_HexCharReference2;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case state_HexCharReference2:
				if (c >= 'a' and c <= 'f')
					charref = (charref << 4) + (c - 'a' + 10);
				else if (c >= 'A' and c <= 'F')
					charref = (charref << 4) + (c - 'A' + 10);
				else if (c >= '0' and c <= '9')
					charref = (charref << 4) + (c - '0');
				else if (c == ';')
				{
					if (not is_referrable_char(charref))
						not_well_formed("Illegal character referenced: '" + to_hex(charref) + '\'');

					append(result, charref);
					state = state_Start;
				}
				else
					not_well_formed("invalid character reference");
				break;

			case state_EntityReference:
				if (c == ';')
				{
					if (std::ranges::find(m_entities_on_stack, name) != m_entities_on_stack.end())
						not_well_formed("infinite recursion in nested entity references");

					m_entities_on_stack.push_back(name);

					const doctype::entity &e = get_general_entity(name);

					if (e.is_external())
						not_well_formed("attribute value may not contain external entity reference");

					if (e.is_externally_defined() and m_standalone)
						not_well_formed("document marked as standalone but an external entity is referenced");

					push_data_source(new entity_data_source(e.get_replacement(), m_source.top()->base()), false);

					std::string replacement = normalize_attribute_value();
					result += replacement;

					state = state_Start;

					m_entities_on_stack.pop_back();
				}
				else if (is_name_char(c))
					append(name, c);
				else
					not_well_formed("invalid entity reference");
				break;

			default:
				assert(false);
				not_well_formed("invalid state");
		}
	}

	if (state != state_Start)
		not_well_formed("invalid reference");

	m_source.pop();

	return result;
}

// Collapse the spaces in s, in place: leading and trailing spaces are
// removed and every other run of spaces is replaced by a single space. Only
// the ' ' character is treated as a space.
void parser_imp::collapse_spaces(std::string &s)
{
	std::string::size_type out = 0; // number of characters kept so far
	bool pending_space = false;     // a space is due before the next character

	for (const char ch : s)
	{
		if (ch == ' ')
		{
			// spaces before the first kept character are dropped
			if (out > 0)
				pending_space = true;
			continue;
		}

		if (pending_space)
		{
			s[out++] = ' ';
			pending_space = false;
		}

		s[out++] = ch;
	}

	// a space still pending here was trailing and is dropped
	s.resize(out);
}

// Parse an element: the start tag with its attributes, the content, if any,
// and the end tag.
//
// valid is the validator of the enclosing element and is asked whether this
// element is allowed at this position. For the content a new validator is
// created from the doctype declaration of this element.
//
// While parsing the start tag:
//  - attributes are normalized and, when declared, validated
//  - namespace declarations (xmlns and xmlns:prefix) are registered in a
//    local ns_state and reported via m_parser.start_namespace_decl
//  - values of ID, IDREF and IDREFS attributes are tracked in m_ids and
//    m_unresolved_ids
//  - declared attributes with a default value that were not specified are
//    added
//
// The element is reported via m_parser.start_element and
// m_parser.end_element, with the attributes sorted by name.
void parser_imp::element(doctype::validator &valid)
{
	save_state in_content(m_in_content, false);

	match(XMLToken::STag);
	std::string name = m_token;
	match(XMLToken::Name);

	if (not valid.allow(name))
		not_valid("element '" + name + "' not expected at this position");

	auto dte = get_element(name);

	if (m_has_dtd and dte == nullptr and m_validating)
		not_valid("Element '" + name + "' is not defined in DTD");

	doctype::validator sub_valid(dte);

	std::vector<parser::attr> attrs;

	ns_state ns(this);
	std::set<std::string> seen; // attribute names seen so far, to detect duplicates

	for (;;)
	{
		if (m_lookahead != XMLToken::Space)
			break;

		s(true);

		if (m_lookahead != XMLToken::Name)
			break;

		std::string attr_name = m_token;
		match(XMLToken::Name);

		if (seen.count(attr_name) > 0)
			not_well_formed("multiple values for attribute '" + attr_name + "'");
		seen.insert(attr_name);

		eq();

		// find the declaration for this attribute, if there is one
		doctype::attribute_ptr dta;
		if (dte != nullptr)
			dta = dte->get_attribute(attr_name);
		if (dta == nullptr and not m_validating and attr_name == "xml:space")
			dta = m_xmlSpaceAttr;

		if (dta == nullptr and m_validating)
			not_valid("undeclared attribute '" + attr_name + "'");

		// undeclared attributes are normalized as if they were CDATA
		std::string attr_value = normalize_attribute_value(m_token, dta == nullptr or dta->get_type() == doctype::attribute_type::CDATA);
		match(XMLToken::String);

		if (m_validating and
			dta != nullptr and
			dta->get_default_type() == doctype::attribute_default::Fixed and
			attr_value != std::get<1>(dta->get_default()))
		{
			not_valid("invalid value specified for fixed attribute");
		}

		// had a crash suddenly here deep down in starts_with...
		if (attr_name == "xmlns" or attr_name.starts_with("xmlns:")) // namespace support
		{
			if (not((m_version > version_type{ 1, 0 } and attr_value.empty()) or is_valid_url(attr_value)))
				not_well_formed("Not a valid namespace URI: " + attr_value);

			if (not(m_version > version_type{ 1, 0 } and attr_value.empty()) and ns.is_known_uri(attr_value))
				not_well_formed("This uri is repeated: " + attr_value);

			if (attr_value == "http://www.w3.org/XML/1998/namespace" or attr_value == "http://www.w3.org/2000/xmlns/")
				not_well_formed("The xml namespace is reserved");

			if (attr_name.length() == 5) // "xmlns": the default namespace
			{
				ns.default_ns(attr_value);
				m_parser.start_namespace_decl("", attr_value);
			}
			else if (attr_name.length() == 6) // "xmlns:" without a prefix
				not_well_formed("Invalid xmlns: ");
			else
			{
				std::string prefix = attr_name.substr(6);

				if (iequals(prefix, "xml") or iequals(prefix, "xmlns"))
					not_well_formed(prefix + " is a preserved prefix");

				// for versions after 1.0 an empty value removes the binding
				if (m_version > version_type{ 1, 0 } and attr_value.empty())
					ns.unbind(prefix);
				else
				{
					ns.bind(prefix, attr_value);
					m_parser.start_namespace_decl(prefix, attr_value);
				}
			}

			// if (not attr_value.empty())
			// 	ns.m_known_uris.insert(attr_value);
		}
		else
		{
			bool id = (attr_name == "xml:id");

			if (dta != nullptr)
			{
				// copy of the value, to compare with after validate_value
				std::string v(attr_value);

				if (not dta->validate_value(attr_value, m_general_entities))
				{
					if (dta == m_xmlSpaceAttr)
						not_well_formed(std::format("invalid value ('{}') for attribute {}", attr_value, attr_name));
					else
						not_valid(std::format("invalid value ('{}') for attribute {}", attr_value, attr_name));
				}

				if (m_validating and m_standalone and dta->is_external() and v != attr_value)
					not_valid("attribute value modified as a result of an external defined attlist declaration, which is not valid in a standalone document");

				if (dta->get_type() == doctype::attribute_type::ID)
				{
					id = true;

					if (m_validating_ns and attr_value.find(':') != std::string::npos)
						not_valid("ID attribute value should not contain a colon");

					if (m_ids.count(attr_value) > 0)
					{
						not_valid(std::format("attribute value ('{}') for attribute '{}' is not unique", attr_value, attr_name));
					}

					m_ids.insert(attr_value);

					// references to this ID seen earlier are now resolved
					if (m_unresolved_ids.count(attr_value) > 0)
						m_unresolved_ids.erase(attr_value);
				}
				else if (dta->get_type() == doctype::attribute_type::IDREF)
				{
					if (attr_value.empty())
						not_valid("attribute value for attribute '" + attr_name + "' may not be empty");

					if (not m_ids.count(attr_value))
						m_unresolved_ids.insert(attr_value);
				}
				else if (dta->get_type() == doctype::attribute_type::IDREFS)
				{
					if (attr_value.empty())
						not_valid("attribute value for attribute '" + attr_name + "' may not be empty");

					// split the value on spaces, [b, e) is the current token
					std::string::size_type b = 0, e = attr_value.find(' ');
					while (e != std::string::npos)
					{
						if (e - b > 0)
						{
							// The second argument of substr is a length, not
							// an end position.
							std::string idv = attr_value.substr(b, e - b);
							if (not m_ids.count(idv))
								m_unresolved_ids.insert(idv);
						}
						b = e + 1;
						e = attr_value.find(' ', b);
					}

					// the last token
					if (b != std::string::npos and b < attr_value.length())
					{
						std::string idv = attr_value.substr(b);
						if (not m_ids.count(idv))
							m_unresolved_ids.insert(idv);
					}
				}
			}

			parser::attr a;
			a.m_name = attr_name;
			a.m_value = attr_value;
			a.m_id = id;

			// split the name of an undeclared, prefixed attribute into a
			// namespace and a local name
			if (m_ns != nullptr and dta == nullptr)
			{
				std::string::size_type d = attr_name.find(':');
				if (d != std::string::npos)
				{
					if (attr_name.find(':', d + 1) != std::string::npos)
						not_well_formed("Multiple colons in attribute name");

					auto prefix = attr_name.substr(0, d);
					if (not iequals(prefix, "xml"))
					{
						std::string nsv = m_ns->ns_for_prefix(prefix);

						if (nsv.empty())
							not_well_formed("Unbound attribute prefix");

						a.m_ns = nsv;
						a.m_name = attr_name.substr(d + 1);
					}
				}
			}

			attrs.push_back(a);
		}
	}

	if (dte == nullptr)
	{
		if (name[0] == ':')
			not_well_formed("Element name should not start with colon");

		auto cp = name.find(':');
		if (cp != std::string::npos)
		{
			auto prefix = name.substr(0, cp);
			if (not ns.is_known_prefix(prefix))
				not_well_formed("Unknown prefix for element " + name);
		}
	}
	else // add missing attributes
	{
		for (const auto &dta : dte->get_attributes())
		{
			std::string attr_name = dta->name();

			auto ai = std::ranges::find_if(attrs,
				[&attr_name](auto &a)
				{ return a.m_name == attr_name; });

			doctype::attribute_default defType;
			std::string defValue;

			std::tie(defType, defValue) = dta->get_default();

			if (defType == doctype::attribute_default::Required)
			{
				if (ai == attrs.end())
					not_valid(std::format("missing #REQUIRED attribute '{}' for element '{}'", attr_name, name));
			}
			else if (not defValue.empty() and ai == attrs.end())
			{
				if (m_validating and m_standalone and dta->is_external())
					not_valid("default value for attribute defined in external declaration which is not allowed in a standalone document");

				parser::attr def_attr;
				def_attr.m_name = attr_name;
				def_attr.m_value = normalize_attribute_value(defValue, dta->get_type() == doctype::attribute_type::CDATA);
				def_attr.m_id = false;

				if (m_ns != nullptr)
				{
					std::string::size_type d = attr_name.find(':');
					if (d != std::string::npos)
					{
						std::string nsv = m_ns->ns_for_prefix(attr_name.substr(0, d));

						if (not nsv.empty())
						{
							def_attr.m_ns = nsv;
							def_attr.m_name = attr_name.substr(d + 1);
						}
					}
				}

				attrs.push_back(def_attr);
			}
		}
	}

	// now find out the namespace we're supposed to pass
	// raw keeps the name as written, to compare with the end tag
	std::string uri, raw(name);

	std::string::size_type c = name.find(':');
	if (c != std::string::npos and c > 0)
	{
		uri = ns.ns_for_prefix(name.substr(0, c));
		name.erase(0, c + 1);
	}
	else
		uri = ns.default_ns();

	// sort the attributes
	std::ranges::sort(attrs, [](auto &a, auto &b)
		{ return a.m_name < b.m_name; });

	if (m_lookahead == XMLToken::Slash) // empty element tag
	{
		match(XMLToken::Slash);
		m_parser.start_element(name, uri, attrs);
		m_parser.end_element(name, uri);
	}
	else
	{
		m_parser.start_element(name, uri, attrs);

		m_in_content = true;
		match(XMLToken::GreaterThan);

		if (m_lookahead != XMLToken::ETag)
			content(sub_valid);

		m_in_content = false;

		match(XMLToken::ETag);

		if (m_token != raw)
			not_well_formed("end tag does not match start tag");

		match(XMLToken::Name);

		s();

		m_parser.end_element(name, uri);
	}

	in_content.reset();
	match(XMLToken::GreaterThan);

	if (m_validating and dte != nullptr and not sub_valid.done())
		not_valid("missing child elements for element '" + dte->name() + "'");
}

// Parse the content of an element: character data, character and entity
// references, child elements, processing instructions, comments and CDATA
// sections, until an end tag or the end of the current data source is found.
//
// valid is the validator for the element that contains this content; its
// content specification determines which kinds of content are reported as
// not valid.
void parser_imp::content(doctype::validator &valid)
{
	using spec_type = doctype::content_spec_type;

	if (valid.get_content_spec() == spec_type::Empty and m_lookahead != XMLToken::ETag)
		not_valid("Content is not allowed in an element declared to be EMPTY");

	do
	{
		switch (m_lookahead)
		{
			case XMLToken::Content:
			case XMLToken::Space:
			{
				const auto spec = valid.get_content_spec();

				if (spec == spec_type::Empty)
					not_valid("character data not allowed in EMPTY element");
				else if (spec == spec_type::Children and m_lookahead == XMLToken::Content)
					not_valid("character data '" + m_token + "' not allowed in element");

				m_parser.character_data(m_token);
				match(m_lookahead);
				break;
			}

			case XMLToken::CharRef:
			{
				const auto spec = valid.get_content_spec();

				if (spec == spec_type::Empty)
					not_valid("data not allowed in EMPTY element");
				else if (spec == spec_type::Children and is_space(m_token))
					not_valid("Element may not contain reference to space");

				m_parser.character_data(m_token);
				match(m_lookahead);
				break;
			}

			case XMLToken::Reference:
			{
				// an entity that is being expanded may not be referenced again
				if (std::ranges::find(m_entities_on_stack, m_token) != m_entities_on_stack.end())
					not_well_formed("infinite recursion of entity references");

				m_entities_on_stack.push_back(m_token);

				const doctype::entity &entity = get_general_entity(m_token);

				if (entity.is_externally_defined() and m_standalone)
					not_well_formed("document marked as standalone but an external entity is referenced");

				if (not entity.is_parsed())
					not_well_formed("content has a general entity reference to an unparsed entity");

				// parse the replacement text as content
				push_data_source(new entity_data_source(entity.get_replacement(), m_source.top()->base()), false);

				m_lookahead = get_next_content();

				save_state in_external_dtd(m_in_external_dtd, entity.is_externally_defined());

				// a children production may not contain references to spaces
				if (m_lookahead == XMLToken::Space and valid.get_content_spec() == spec_type::Children)
				{
					auto leading_space = m_token;
					match(m_lookahead);

					if (m_lookahead == XMLToken::Eof)
						not_valid("Element may not contain reference to space");
					m_parser.character_data(leading_space);
				}

				if (m_lookahead != XMLToken::Eof)
					content(valid);

				// the replacement text must have been consumed entirely
				if (m_lookahead != XMLToken::Eof)
					not_well_formed("entity reference should be a valid content production");

				pop_data_source();

				match(XMLToken::Reference);

				m_entities_on_stack.pop_back();
				break;
			}

			case XMLToken::STag:
				element(valid);
				break;

			case XMLToken::PI:
				pi();
				break;

			case XMLToken::Comment:
				comment();
				break;

			case XMLToken::CDSect:
			{
				const auto spec = valid.get_content_spec();

				if (spec != spec_type::Mixed and spec != spec_type::Any)
					not_valid("character data '" + m_token + "' not allowed in element");

				m_parser.start_cdata_section();
				m_parser.character_data(m_token);

				if (is_space(m_token) and spec == spec_type::Children)
					not_valid("Element may not contain CDATA section containing only space");

				m_parser.end_cdata_section();

				match(XMLToken::CDSect);
				break;
			}

			default:
				match(XMLToken::Content); // will fail and report error
		}
	} while (m_lookahead != XMLToken::ETag and m_lookahead != XMLToken::Eof);
}

// Parse a comment; the lookahead token is XMLToken::Comment.
//
// Characters are read until the closing "-->" is found. A "--" that is not
// followed by '>', an invalid character and the end of input are reported as
// not well-formed. The text of the comment is taken from m_token with its
// last three characters removed and reported via m_parser.comment.
void parser_imp::comment()
{
	save_state in_content(m_in_content, false);

	m_token.clear();

	// number of consecutive hyphens just read: 0, 1 or 2
	int hyphens = 0;

	for (bool closed = false; not closed;)
	{
		char32_t ch = get_next_char();

		if (ch == 0)
			not_well_formed("runaway comment");
		if (not is_char(ch))
			not_well_formed("illegal character in content: '" + to_hex(ch) + '\'');

		if (hyphens == 2)
		{
			// after "--" only the closing '>' is allowed
			if (ch == '>')
				closed = true;
			else
				not_well_formed("double hyphen found in comment");
		}
		else if (ch == '-')
			++hyphens;
		else
			hyphens = 0;
	}

	// strip the last three characters
	// NOTE(review): presumably m_token was filled by get_next_char() and these
	// are the closing "-->" -- confirm.
	assert(m_token.length() >= 3);
	m_token.erase(m_token.length() - 3);
	m_parser.comment(m_token);

	in_content.reset();
	match(XMLToken::Comment);
}

// Parse a processing instruction and report it through
// m_parser.processing_instruction(target, data).
//
// The target is taken from the current token with its first two characters
// removed (the token is expected to be "<?" followed by the target name).
// The remaining characters are then read up to and including the closing
// "?>". Leading white space of the data is discarded.
//
// Reported as not well-formed:
//  - an empty target,
//  - a target containing a colon when m_validating_ns is set,
//  - a target equal to "xml" in any letter case,
//  - input ending before "?>" was seen,
//  - a character rejected by is_char(),
//  - anything other than white space or "?>" directly after the target.
void parser_imp::pi()
{
	save_state in_content(m_in_content, false);

	// m_lookahead == XMLToken::PI
	// read characters until we reach ?>
	// check all characters in between for validity

	const std::string pi_target = m_token.substr(2);

	if (pi_target.empty())
		not_well_formed("processing instruction target missing");

	if (m_validating_ns and pi_target.find(':') != std::string::npos)
		not_well_formed("processing instruction name should not contain a colon");

	// we treat the xml processing instruction separately.
	if (pi_target == "xml")
		not_well_formed("xml declaration are only valid as the start of the file");
	else if (iequals(pi_target, "xml"))
		not_well_formed("<?XML is neither an XML declaration nor a legal processing instruction target");

	enum
	{
		state_Start,            // directly after the target
		state_CloseAfterTarget, // '?' seen directly after the target, only '>' may follow
		state_DataStart,        // skipping white space in front of the data
		state_Data,             // inside the data
		state_QuestionMarkSeen, // '?' seen after white space or inside the data
		state_PIClosed          // "?>" seen
	} state = state_Start;

	m_token.clear();

	while (state != state_PIClosed)
	{
		char32_t ch = get_next_char();

		if (ch == 0)
			not_well_formed("runaway processing instruction");
		if (not is_char(ch))
			not_well_formed("illegal character in processing instruction: '" + to_hex(ch) + '\'');

		switch (state)
		{
			case state_Start:
				if (ch == '?')
					state = state_CloseAfterTarget;
				else if (ch == ' ' or ch == '\n' or ch == '\t')
				{
					m_token.clear();
					state = state_DataStart;
				}
				else
					not_well_formed("a space is required before pi data");
				break;

			case state_CloseAfterTarget:
				// No white space was seen yet, so the PI cannot have data:
				// the '?' has to be the first half of the closing "?>".
				// Treating it as data would accept e.g. <?target?data?>.
				if (ch == '>')
					state = state_PIClosed;
				else
					not_well_formed("a space is required before pi data");
				break;

			case state_DataStart:
				// Clearing the token on every white space character drops
				// the leading white space from the reported data.
				if (ch == ' ' or ch == '\n' or ch == '\t')
					m_token.clear();
				else if (ch == '?')
					state = state_QuestionMarkSeen;
				else
					state = state_Data;
				break;

			case state_Data:
				if (ch == '?')
					state = state_QuestionMarkSeen;
				break;

			case state_QuestionMarkSeen:
				// A run of question marks keeps this state so that "??>"
				// still closes the PI; any other character is plain data.
				if (ch == '>')
					state = state_PIClosed;
				else if (ch != '?')
					state = state_Data;
				break;

			case state_PIClosed:
				assert(false);
		}
	}

	// The collected token is expected to end with the two characters of the
	// terminator; strip them before reporting the data.
	m_token.erase(m_token.end() - 2, m_token.end());
	m_parser.processing_instruction(pi_target, m_token);

	in_content.reset();
	match(XMLToken::PI);
}

// --------------------------------------------------------------------

parser::parser(std::istream &data)
	: m_impl(new parser_imp(data, *this))
{
}

// Release the implementation object and the stream pointed to by
// m_istream.
// NOTE(review): m_impl is deleted before m_istream; presumably the
// implementation may still refer to that stream, so keep this order —
// confirm against the constructors that set m_istream.
parser::~parser()
{
	delete m_impl;
	delete m_istream;
}

void parser::parse(bool validate, bool validate_ns)
{
	m_impl->parse(validate, validate_ns);
}

// Pass the XML declaration values on to xml_decl_handler.
// Does nothing when that handler tests false.
void parser::xml_decl(encoding_type encoding, bool standalone, version_type version)
{
	if (not xml_decl_handler)
		return;

	xml_decl_handler(encoding, standalone, version);
}

// Pass an element start on to start_element_handler; name and uri are
// moved into the call, atts is passed by reference.
// Does nothing when that handler tests false.
void parser::start_element(std::string name, std::string uri, const std::vector<attr> &atts)
{
	if (not start_element_handler)
		return;

	start_element_handler(std::move(name), std::move(uri), atts);
}

// Pass an element end on to end_element_handler, moving both strings
// into the call. Does nothing when that handler tests false.
void parser::end_element(std::string name, std::string uri)
{
	if (not end_element_handler)
		return;

	end_element_handler(std::move(name), std::move(uri));
}

// Pass character data on to character_data_handler, moving the string
// into the call. Does nothing when that handler tests false.
void parser::character_data(std::string data)
{
	if (not character_data_handler)
		return;

	character_data_handler(std::move(data));
}

// Pass a processing instruction on to processing_instruction_handler,
// moving target and data into the call.
// Does nothing when that handler tests false.
void parser::processing_instruction(std::string target, std::string data)
{
	if (not processing_instruction_handler)
		return;

	processing_instruction_handler(std::move(target), std::move(data));
}

// Pass comment text on to comment_handler, moving the string into the
// call. Does nothing when that handler tests false.
void parser::comment(std::string data)
{
	if (not comment_handler)
		return;

	comment_handler(std::move(data));
}

// Invoke start_cdata_section_handler.
// Does nothing when that handler tests false.
void parser::start_cdata_section()
{
	if (not start_cdata_section_handler)
		return;

	start_cdata_section_handler();
}

// Invoke end_cdata_section_handler.
// Does nothing when that handler tests false.
void parser::end_cdata_section()
{
	if (not end_cdata_section_handler)
		return;

	end_cdata_section_handler();
}

// Pass a namespace declaration start on to start_namespace_decl_handler,
// moving prefix and uri into the call.
// Does nothing when that handler tests false.
void parser::start_namespace_decl(std::string prefix, std::string uri)
{
	if (not start_namespace_decl_handler)
		return;

	start_namespace_decl_handler(std::move(prefix), std::move(uri));
}

// Pass a namespace declaration end on to end_namespace_decl_handler,
// moving the prefix into the call.
// Does nothing when that handler tests false.
void parser::end_namespace_decl(std::string prefix)
{
	if (not end_namespace_decl_handler)
		return;

	end_namespace_decl_handler(std::move(prefix));
}

// Pass a doctype declaration on to doctype_decl_handler, moving all three
// strings into the call. Does nothing when that handler tests false.
void parser::doctype_decl(std::string root, std::string publicId, std::string uri)
{
	if (not doctype_decl_handler)
		return;

	doctype_decl_handler(std::move(root), std::move(publicId), std::move(uri));
}

// Pass a notation declaration on to notation_decl_handler, moving all
// three strings into the call. Does nothing when that handler tests false.
void parser::notation_decl(std::string name, std::string systemId, std::string publicId)
{
	if (not notation_decl_handler)
		return;

	notation_decl_handler(std::move(name), std::move(systemId), std::move(publicId));
}

// Ask external_entity_ref_handler for a stream for the external entity
// identified by base, pubid and uri and return whatever it returns.
// Returns an empty pointer when that handler tests false.
std::unique_ptr<std::istream> parser::external_entity_ref(std::string_view base, std::string_view pubid, std::string_view uri)
{
	if (not external_entity_ref_handler)
		return {};

	return external_entity_ref_handler(base, pubid, uri);
}

// Pass a validation message on to report_invalidation_handler, moving the
// string into the call. Does nothing when that handler tests false.
void parser::report_invalidation(std::string msg)
{
	if (not report_invalidation_handler)
		return;

	report_invalidation_handler(std::move(msg));
}

} // namespace zeem
