From be50ab4890993ae97bc79138364cd5e316566e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?cel=20=F0=9F=8C=B8?= Date: Sun, 10 Nov 2024 14:31:43 +0000 Subject: [PATCH] implement element reading --- Cargo.lock | 12 +- Cargo.toml | 1 + src/element.rs | 44 +++++-- src/error.rs | 17 ++- src/lib.rs | 2 + src/main.rs | 87 ++++++++++++- src/reader.rs | 325 ++++++++++++++++++++++++++++++++++++++++++++++++- src/xml/mod.rs | 241 +++++++++++++++++++++++++++++++++--- 8 files changed, 692 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f2e8d8..215071a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -46,9 +46,9 @@ checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "bytes" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" [[package]] name = "cc" @@ -62,6 +62,13 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "circular" +version = "0.3.0" +dependencies = [ + "bytes", +] + [[package]] name = "futures" version = "0.3.30" @@ -265,6 +272,7 @@ dependencies = [ name = "peanuts" version = "0.1.0" dependencies = [ + "circular", "futures", "nom", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 76552ac..5586a6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +circular = { version = "0.3.0", path = "../circular" } futures = "0.3.30" nom = "7.1.3" tokio = { version = "1.36.0", features = ["io-util", "net", "io-std", "full"] } diff --git a/src/element.rs b/src/element.rs index 35d73a3..0e0b8f1 100644 --- a/src/element.rs +++ b/src/element.rs @@ -1,23 +1,32 @@ // elements resemble a final tree, including inherited namespace information -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; + +use crate::{ + error::Error, + xml::{self, Attribute}, +}; // when are namespaces names chosen then if they are automatically calculated // namespaces are held by readers and writers. +#[derive(PartialEq, Eq, Hash, Clone)] pub struct Namespace { - prefix: Option, - namespace: String, + pub prefix: Option, + pub namespace: String, } // names are qualified, they contain a reference to the namespace (held within the reader/writer) +#[derive(PartialEq, Eq, Hash, Clone)] pub struct Name { - namespace: String, - name: String, + pub namespace: Namespace, + pub name: String, } -pub enum Node { +pub enum Content { Element(Element), Text(String), + PI(String), + Comment(String), } // should this be a trait? @@ -29,16 +38,35 @@ pub struct Element { // namespace: String, // hashmap of explicit namespace declarations on the element itself only // possibly not needed as can be calculated at write time depending on context and qualified namespace, and for reading, element validity and namespaces are kept track of by the reader. - pub namespace_decl: HashMap, String>, + pub namespace_decl: HashSet, // attributes can be in a different namespace than the element. how to make sure they are valid? // maybe include the namespace instead of or with the prefix // you can calculate the prefix from the namespaced name and the current writer context // you can validate the prefix and calculate the namespace from the current reader context // this results in readers and writers being able to return qualification errors as they aren't able to create elements until every part is qualified. pub attributes: HashMap, - pub children: Option>, + pub content: Vec, } +// impl<'s> TryFrom> for Element<'s> { +// type Error = Error; + +// fn try_from(xml_element: xml::Element) -> Result { +// match &xml_element { +// xml::Element::Empty(empty_elem_tag) => { +// let namespace_decl; +// let attributes; +// empty_elem_tag +// .attributes +// .into_iter() +// .filter(|attribute| matches!(attribute, Attribute::NamespaceDeclaration(_))); +// todo!() +// } +// xml::Element::NotEmpty(stag, content, etag) => todo!(), +// } +// } +// } + // example of deriving an element: // #[derive(XMLWrite, XMLRead)] diff --git a/src/error.rs b/src/error.rs index 78508ae..96c709c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,9 +1,18 @@ -use std::str::Utf8Error; +use std::{num::ParseIntError, str::Utf8Error}; + +use crate::element::{Name, Namespace}; pub enum Error { ReadError(std::io::Error), Utf8Error(Utf8Error), ParseError(String), + EntityProcessError(String), + // TODO: better choice for failures than string + InvalidCharRef(String), + DuplicateNameSpace(Namespace), + DuplicateAttribute(String), + UnqualifiedNamespace(String), + MismatchedEndTag(String, String), } impl From for Error { @@ -17,3 +26,9 @@ impl From for Error { Self::Utf8Error(e) } } + +impl From for Error { + fn from(e: ParseIntError) -> Self { + Self::InvalidCharRef(e.to_string()) + } +} diff --git a/src/lib.rs b/src/lib.rs index dcf14fe..329c092 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,3 +3,5 @@ mod error; mod reader; mod writer; pub mod xml; + +pub type Result = std::result::Result; diff --git a/src/main.rs b/src/main.rs index ea86e07..580652e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,12 +5,87 @@ use peanuts::xml::Document; #[tokio::main] async fn main() { let (rest, document) = Document::parse( - " - - Background Mark 1 - Background Mark 2 - Background Mark 3 -ahsdkjlfhasdlkjfhkljh + " + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + asdf ", ) .unwrap(); diff --git a/src/reader.rs b/src/reader.rs index 313de4c..b51489f 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,31 +1,348 @@ +use circular::Buffer; use futures::Stream; use nom::Err; -use std::{collections::BTreeMap, str}; -use tokio::io::AsyncBufReadExt; +use std::{ + collections::{BTreeMap, HashMap, HashSet}, + path::Prefix, + str::{self, FromStr}, +}; +use tokio::io::{AsyncBufRead, AsyncBufReadExt, AsyncRead, AsyncReadExt}; + +static MAX_STANZA_SIZE: usize = 65536; use crate::{ - element::{Element, Name, Namespace}, + element::{Content, Element, Name, Namespace}, error::Error, + xml::{self, parsers::Parser}, + Result, }; /// streaming reader that tracks depth and available namespaces at current depth pub struct Reader { inner: R, + buffer: Buffer, // holds which tags we are in atm over depth + // to have names reference namespaces could depth: Vec, - namespaces: Vec<(usize, Namespace)>, + namespaces: Vec>, } impl Reader { pub fn new(reader: R) -> Self { Self { inner: reader, + buffer: Buffer::with_capacity(MAX_STANZA_SIZE), depth: Vec::new(), namespaces: Vec::new(), } } } +impl Reader +where + R: AsyncRead + Unpin, +{ + async fn read_buf(&mut self) -> Result { + Ok(self.inner.read_buf(&mut self.buffer).await?) + } + + async fn read_element<'s>(&'s mut self) -> Result { + self.read_buf().await?; + let mut input = str::from_utf8(self.buffer.data())?; + loop { + match xml::Element::parse(input) { + Ok((rest, e)) => { + let len = self.buffer.available_data() - rest.as_bytes().len(); + let element = Reader::::element_from_xml(&mut self.namespaces, e)?; + self.buffer.consume(len); + return Ok(element); + } + std::result::Result::Err(e) => match e { + Err::Incomplete(_) => { + self.read_buf().await?; + input = str::from_utf8(self.buffer.data())?; + } + // TODO: better error + Err::Error(e) => return Err(Error::ParseError(e.to_string())), + Err::Failure(e) => return Err(Error::ParseError(e.to_string())), + }, + } + } + } +} + +impl Reader { + fn element_from_xml( + namespaces: &mut Vec>, + element: xml::Element, + ) -> Result { + match element { + xml::Element::Empty(empty_elem_tag) => { + let mut namespace_declarations = HashSet::new(); + for (prefix, namespace) in + empty_elem_tag.attributes.iter().filter_map(|attribute| { + if let xml::Attribute::NamespaceDeclaration { ns_name, value } = attribute { + Some((ns_name, value)) + } else { + None + } + }) + { + let prefix = match prefix { + xml::NSAttName::PrefixedAttName(prefixed_att_name) => { + Some(prefixed_att_name.to_string()) + } + xml::NSAttName::DefaultAttName => None, + }; + let namespace = Namespace { + prefix, + namespace: namespace.process()?, + }; + if !namespace_declarations.insert(namespace.clone()) { + return Err(Error::DuplicateNameSpace(namespace)); + } + } + + // all namespaces available to the element (from both parent elements and element itself) + let namespace_stack: Vec<&Namespace> = namespaces + .iter() + .flatten() + .chain(namespace_declarations.iter()) + .collect(); + + let mut attributes = HashMap::new(); + + for (q_name, value) in empty_elem_tag.attributes.iter().filter_map(|attribute| { + if let xml::Attribute::Attribute { name, value } = attribute { + Some((name, value)) + } else { + None + } + }) { + let namespace; + let attribute_name; + match q_name { + xml::QName::PrefixedName(prefixed_name) => { + namespace = namespace_stack.iter().rfind(|namespace| { + namespace.prefix.as_deref() == Some(**prefixed_name.prefix) + }); + attribute_name = prefixed_name.local_part.to_string(); + } + xml::QName::UnprefixedName(unprefixed_name) => { + namespace = namespace_stack + .iter() + .rfind(|namespace| namespace.prefix == None); + attribute_name = unprefixed_name.to_string(); + } + } + if let Some(namespace) = namespace { + let namespace = (*namespace).clone(); + let name = Name { + namespace, + name: attribute_name, + }; + let value = value.process()?; + if let Some(_value) = attributes.insert(name, value) { + return Err(Error::DuplicateAttribute(q_name.to_string())); + } + } else { + return Err(Error::UnqualifiedNamespace(q_name.to_string())); + } + } + + let name; + let namespace; + match &empty_elem_tag.name { + xml::QName::PrefixedName(prefixed_name) => { + namespace = namespace_stack.iter().rfind(|namespace| { + namespace.prefix.as_deref() == Some(**prefixed_name.prefix) + }); + name = prefixed_name.local_part.to_string(); + } + xml::QName::UnprefixedName(unprefixed_name) => { + namespace = namespace_stack + .iter() + .rfind(|namespace| namespace.prefix == None); + name = unprefixed_name.to_string(); + } + } + + let namespace = (*namespace + .ok_or_else(|| Error::UnqualifiedNamespace(empty_elem_tag.name.to_string()))?) + .clone(); + + let name = Name { namespace, name }; + + return Ok(Element { + name, + namespace_decl: namespace_declarations, + attributes, + content: Vec::new(), + }); + } + xml::Element::NotEmpty(s_tag, content, e_tag) => { + if s_tag.name != e_tag.name { + return Err(Error::MismatchedEndTag( + s_tag.name.to_string(), + e_tag.name.to_string(), + )); + } + let mut namespace_declarations = HashSet::new(); + for (prefix, namespace) in s_tag.attributes.iter().filter_map(|attribute| { + if let xml::Attribute::NamespaceDeclaration { ns_name, value } = attribute { + Some((ns_name, value)) + } else { + None + } + }) { + let prefix = match prefix { + xml::NSAttName::PrefixedAttName(prefixed_att_name) => { + Some(prefixed_att_name.to_string()) + } + xml::NSAttName::DefaultAttName => None, + }; + let namespace = Namespace { + prefix, + namespace: namespace.process()?, + }; + if !namespace_declarations.insert(namespace.clone()) { + return Err(Error::DuplicateNameSpace(namespace)); + } + } + + // all namespaces available to the element (from both parent elements and element itself) + let namespace_stack: Vec<&Namespace> = namespaces + .iter() + .flatten() + .chain(namespace_declarations.iter()) + .collect(); + + let mut attributes = HashMap::new(); + + for (q_name, value) in s_tag.attributes.iter().filter_map(|attribute| { + if let xml::Attribute::Attribute { name, value } = attribute { + Some((name, value)) + } else { + None + } + }) { + let namespace; + let attribute_name; + match q_name { + xml::QName::PrefixedName(prefixed_name) => { + namespace = namespace_stack.iter().rfind(|namespace| { + namespace.prefix.as_deref() == Some(**prefixed_name.prefix) + }); + attribute_name = prefixed_name.local_part.to_string(); + } + xml::QName::UnprefixedName(unprefixed_name) => { + namespace = namespace_stack + .iter() + .rfind(|namespace| namespace.prefix == None); + attribute_name = unprefixed_name.to_string(); + } + } + if let Some(namespace) = namespace { + let namespace = (*namespace).clone(); + let name = Name { + namespace, + name: attribute_name, + }; + let value = value.process()?; + if let Some(_value) = attributes.insert(name, value) { + return Err(Error::DuplicateAttribute(q_name.to_string())); + } + } else { + return Err(Error::UnqualifiedNamespace(q_name.to_string())); + } + } + + let name; + let namespace; + match &s_tag.name { + xml::QName::PrefixedName(prefixed_name) => { + namespace = namespace_stack.iter().rfind(|namespace| { + namespace.prefix.as_deref() == Some(**prefixed_name.prefix) + }); + name = prefixed_name.local_part.to_string(); + } + xml::QName::UnprefixedName(unprefixed_name) => { + namespace = namespace_stack + .iter() + .rfind(|namespace| namespace.prefix == None); + name = unprefixed_name.to_string(); + } + } + + let namespace = (*namespace + .ok_or_else(|| Error::UnqualifiedNamespace(s_tag.name.to_string()))?) + .clone(); + + let name = Name { namespace, name }; + + namespaces.push(namespace_declarations.clone()); + + let content = Self::content_from_xml(namespaces, content)?; + + namespaces.pop(); + + return Ok(Element { + name, + namespace_decl: namespace_declarations, + attributes, + content, + }); + } + } + } + + fn content_from_xml( + namespaces: &mut Vec>, + element: xml::Content, + ) -> Result> { + let mut content = Vec::new(); + let mut text = element.char_data.map(|str| String::from(*str)); + for (content_item, char_data) in element.content { + match content_item { + xml::ContentItem::Element(element) => { + text.map(|text| content.push(Content::Text(text))); + content.push(Content::Element(Self::element_from_xml( + namespaces, element, + )?)); + text = char_data.map(|str| String::from(*str)); + } + xml::ContentItem::Reference(reference) => { + let data = reference.process()?; + if let Some(text) = &mut text { + text.push(data) + } else { + text = Some(String::from(data)) + } + char_data.map(|char_data| text.as_mut().map(|s| s.push_str(*char_data))); + } + xml::ContentItem::CDSect(cd_sect) => { + if let Some(text) = &mut text { + text.push_str(**cd_sect) + } else { + text = Some(String::from(**cd_sect)) + } + char_data.map(|char_data| text.as_mut().map(|s| s.push_str(*char_data))); + } + // TODO: is this important? + xml::ContentItem::PI(pi) => { + char_data.map(|char_data| text.as_mut().map(|s| s.push_str(*char_data))); + } + // TODO: comments? + xml::ContentItem::Comment(comment) => { + char_data.map(|char_data| text.as_mut().map(|s| s.push_str(*char_data))); + } + } + } + text.map(|text| content.push(Content::Text(text))); + todo!() + } +} + // impl Reader // where // R: AsyncBufReadExt + Unpin, diff --git a/src/xml/mod.rs b/src/xml/mod.rs index f072fde..221c334 100644 --- a/src/xml/mod.rs +++ b/src/xml/mod.rs @@ -1,4 +1,6 @@ -use std::char; +use std::{char, ops::Deref}; + +use crate::error::Error; pub mod composers; pub mod parsers; @@ -14,40 +16,91 @@ pub enum NSAttName<'s> { #[derive(Clone, Debug)] pub struct PrefixedAttName<'s>(NCName<'s>); +impl<'s> Deref for PrefixedAttName<'s> { + type Target = NCName<'s>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [3] DefaultAttName ::= 'xmlns'; #[derive(Clone, Debug)] pub struct DefaultAttName; /// [4] NCName ::= Name - (Char* ':' Char*) -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct NCName<'s>(&'s str); +impl<'s> Deref for NCName<'s> { + type Target = &'s str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [7] QName ::= PrefixedName | UnprefixedName -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum QName<'s> { PrefixedName(PrefixedName<'s>), UnprefixedName(UnprefixedName<'s>), } +impl<'s> ToString for QName<'s> { + fn to_string(&self) -> String { + match self { + QName::PrefixedName(prefixed_name) => { + format!("{}:{}", **prefixed_name.prefix, **prefixed_name.local_part) + } + QName::UnprefixedName(unprefixed_name) => unprefixed_name.to_string(), + } + } +} + /// [8] PrefixedName ::= Prefix ':' LocalPart -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct PrefixedName<'s> { - prefix: Prefix<'s>, - local_part: LocalPart<'s>, + pub(crate) prefix: Prefix<'s>, + pub(crate) local_part: LocalPart<'s>, } /// [9] UnprefixedName ::= LocalPart -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct UnprefixedName<'s>(LocalPart<'s>); +impl<'s> Deref for UnprefixedName<'s> { + type Target = LocalPart<'s>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [10] Prefix ::= NCName -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct Prefix<'s>(NCName<'s>); +impl<'s> Deref for Prefix<'s> { + type Target = NCName<'s>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [11] LocalPart ::= NCName -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct LocalPart<'s>(NCName<'s>); +impl<'s> Deref for LocalPart<'s> { + type Target = NCName<'s>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + // xml spec /// [1] document ::= prolog element Misc* @@ -57,6 +110,14 @@ pub type Document<'s> = (Prolog<'s>, Element<'s>, Vec>); #[repr(transparent)] pub struct Char(char); +impl Deref for Char { + type Target = char; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [3] S ::= (#x20 | #x9 | #xD | #xA)+ #[derive(Clone)] #[repr(transparent)] @@ -66,28 +127,76 @@ pub struct S; #[repr(transparent)] pub struct NameStartChar(char); +impl Deref for NameStartChar { + type Target = char; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] #[repr(transparent)] pub struct NameChar(char); +impl Deref for NameChar { + type Target = char; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [5] Name ::= NameStartChar (NameChar)* #[derive(Debug, Clone, PartialEq, Eq)] #[repr(transparent)] pub struct Name<'s>(&'s str); +impl<'s> Deref for Name<'s> { + type Target = &'s str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [6] Names ::= Name (#x20 Name)* #[repr(transparent)] pub struct Names<'s>(Vec>); +impl<'s> Deref for Names<'s> { + type Target = Vec>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [7] Nmtoken ::= (NameChar)+ #[derive(Debug, Clone)] #[repr(transparent)] pub struct Nmtoken<'s>(&'s str); +impl<'s> Deref for Nmtoken<'s> { + type Target = &'s str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* #[repr(transparent)] pub struct Nmtokens<'s>(Vec>); +impl<'s> Deref for Nmtokens<'s> { + type Target = Vec>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + #[derive(Clone, Debug)] pub enum EntityValueData<'s> { String(&'s str), @@ -115,6 +224,24 @@ pub enum AttValue<'s> { SingleQuoted(Vec>), } +impl<'s> AttValue<'s> { + pub fn process(&self) -> crate::Result { + let mut output = String::new(); + let data; + match self { + AttValue::DoubleQuoted(vec) => data = vec, + AttValue::SingleQuoted(vec) => data = vec, + } + for data in data { + match data { + AttValueData::String(s) => output.push_str(s), + AttValueData::Reference(reference) => output.push(reference.process()?), + } + } + Ok(output) + } +} + /// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") #[derive(Debug)] pub enum SystemLiteral<'s> { @@ -138,11 +265,27 @@ pub struct PubidChar(char); #[repr(transparent)] pub struct CharData<'s>(&'s str); +impl<'s> Deref for CharData<'s> { + type Target = &'s str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [15] Comment ::= '' #[derive(Clone, Debug, PartialEq, Eq)] #[repr(transparent)] pub struct Comment<'s>(&'s str); +impl<'s> Deref for Comment<'s> { + type Target = &'s str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [16] PI ::= '' Char*)))? '?>' #[derive(Clone, Debug)] pub struct PI<'s> { @@ -160,6 +303,14 @@ pub struct PITarget<'s>(Name<'s>); #[repr(transparent)] pub struct CDSect<'s>(CData<'s>); +impl<'s> Deref for CDSect<'s> { + type Target = CData<'s>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [19] CDStart ::= '(&'s str); +impl<'s> Deref for CData<'s> { + type Target = &'s str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [21] CDEnd ::= ']]>' #[derive(Clone, Debug, PartialEq, Eq)] pub struct CDEnd; @@ -286,8 +445,8 @@ pub enum Element<'s> { /// [40] STag ::= '<' Name (S Attribute)* S? '>' #[derive(Debug, Clone)] pub struct STag<'s> { - name: QName<'s>, - attributes: Vec>, + pub(crate) name: QName<'s>, + pub(crate) attributes: Vec>, } /// [15] Attribute ::= NSAttName Eq AttValue | QName Eq AttValue @@ -309,7 +468,7 @@ pub enum Attribute<'s> { /// [42] ETag ::= '' #[derive(Debug, Clone)] pub struct ETag<'s> { - name: QName<'s>, + pub(crate) name: QName<'s>, } #[derive(Debug, Clone)] @@ -324,16 +483,16 @@ pub enum ContentItem<'s> { /// [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* #[derive(Debug, Clone)] pub struct Content<'s> { - char_data: Option>, - content: Vec<(ContentItem<'s>, Option>)>, + pub(crate) char_data: Option>, + pub(crate) content: Vec<(ContentItem<'s>, Option>)>, } /// [14] EmptyElemTag ::= '<' QName (S Attribute)* S? '/>' /// [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec] #[derive(Debug, Clone)] pub struct EmptyElemTag<'s> { - name: QName<'s>, - attributes: Vec>, + pub(crate) name: QName<'s>, + pub(crate) attributes: Vec>, } /// [17] elementdecl ::= '' @@ -503,6 +662,32 @@ pub enum CharRef<'s> { Hexadecimal(&'s str), } +impl<'s> CharRef<'s> { + pub fn process(&self) -> crate::Result { + let int: u32; + match self { + CharRef::Decimal(dec) => { + int = dec.parse()?; + } + CharRef::Hexadecimal(hex) => { + int = ::from_str_radix(hex, 16)?; + } + } + let c = std::char::from_u32(int); + + let c = c.ok_or_else(|| Error::InvalidCharRef(int.to_string()))?; + if matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}') + { + return Ok(c); + } else { + return Err(Error::InvalidCharRef(format!( + "{} is not a valid xml char", + c + ))); + }; + } +} + /// [67] Reference ::= EntityRef | CharRef #[derive(Clone, Debug)] pub enum Reference<'s> { @@ -510,10 +695,34 @@ pub enum Reference<'s> { CharRef(CharRef<'s>), } +impl<'s> Reference<'s> { + pub fn process(&self) -> crate::Result { + match self { + Reference::EntityRef(entity_ref) => match *entity_ref.deref().deref() { + "amp" => Ok('&'), + "lt" => Ok('<'), + "gt" => Ok('>'), + "apos" => Ok('\''), + "quot" => Ok('"'), + e => return Err(Error::EntityProcessError(e.to_string())), + }, + Reference::CharRef(char_ref) => Ok(char_ref.process()?), + } + } +} + /// [68] EntityRef ::= '&' Name ';' #[derive(Clone, Debug)] pub struct EntityRef<'s>(Name<'s>); +impl<'s> Deref for EntityRef<'s> { + type Target = Name<'s>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [69] PEReference ::= '%' Name ';' #[derive(Clone, Debug)] #[repr(transparent)]