From 87e6ff405b0d687ed341f304fba7c5b391a49359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?cel=20=F0=9F=8C=B8?= Date: Sun, 24 Nov 2024 02:05:41 +0000 Subject: [PATCH] misc --- src/declaration.rs | 21 +++++++++ src/element.rs | 19 +++++++- src/error.rs | 8 +++- src/lib.rs | 11 ++++- src/reader.rs | 108 ++++++++++++++++++++++++++++++++++----------- src/writer.rs | 75 +++++++++++++++++++++++++------ src/xml/mod.rs | 44 ++++++++++++++++-- 7 files changed, 240 insertions(+), 46 deletions(-) create mode 100644 src/declaration.rs diff --git a/src/declaration.rs b/src/declaration.rs new file mode 100644 index 0000000..2c0855f --- /dev/null +++ b/src/declaration.rs @@ -0,0 +1,21 @@ +pub struct Declaration { + pub version_info: VersionInfo, + pub encoding_decl: Option, + pub sd_decl: Option, +} + +#[derive(Clone, Copy)] +pub enum VersionInfo { + One, + OneDotOne, +} + +impl Declaration { + pub fn version(version: VersionInfo) -> Self { + Self { + version_info: version, + encoding_decl: None, + sd_decl: None, + } + } +} diff --git a/src/element.rs b/src/element.rs index 04f2e5e..2b149a8 100644 --- a/src/element.rs +++ b/src/element.rs @@ -9,8 +9,22 @@ use std::{ use crate::{ error::Error, xml::{self, parsers_complete::Parser, Attribute}, + Result, }; +pub trait FromElement: Sized { + fn from_element(element: Element) -> Result; +} + +pub trait IntoElement { + fn into_element(&self) -> Element; + + fn get_content(&self) -> Vec { + let element = self.into_element(); + element.content + } +} + // when are namespaces names chosen then if they are automatically calculated // namespaces are held by readers and writers. #[derive(PartialEq, Eq, Hash, Clone, Debug)] @@ -26,7 +40,7 @@ pub struct Name { pub local_name: String, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum Content { Element(Element), Text(String), @@ -35,7 +49,7 @@ pub enum Content { } // should this be a trait? -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Element { pub name: Name, // namespace: Name, @@ -51,6 +65,7 @@ pub struct Element { // you can validate the prefix and calculate the namespace from the current reader context // this results in readers and writers being able to return qualification errors as they aren't able to create elements until every part is qualified. pub attributes: HashMap, + // TODO: make a hashmap maybe? to be able to address parts of the content individually pub content: Vec, } diff --git a/src/error.rs b/src/error.rs index 69993ed..eda527e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,6 +1,6 @@ use std::{num::ParseIntError, str::Utf8Error}; -use crate::element::{Name, NamespaceDeclaration}; +use crate::element::{Content, Name, NamespaceDeclaration}; #[derive(Debug)] pub enum Error { @@ -17,6 +17,12 @@ pub enum Error { NotInElement(String), ExtraData(String), UndeclaredNamespace(String), + IncorrectName(Name), + UnexpectedAttribute(Name), + DeserializeError(String), + UnexpectedNumberOfContents(usize), + UnexpectedContent(Content), + UnexpectedElement(Name), } impl From for Error { diff --git a/src/lib.rs b/src/lib.rs index e8486c4..2e38d4e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,16 @@ -mod element; +pub mod declaration; +pub mod element; mod error; pub mod reader; mod writer; pub mod xml; pub type Result = std::result::Result; + +pub const XML_NS: &str = "http://www.w3.org/XML/1998/namespace"; +pub const XMLNS_NS: &str = "http://www.w3.org/2000/xmlns/"; + +pub use element::Element; +pub use error::Error; +pub use reader::Reader; +pub use writer::Writer; diff --git a/src/reader.rs b/src/reader.rs index f1f3744..ee8d491 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -2,7 +2,7 @@ use circular::Buffer; use futures::{FutureExt, Stream}; use nom::Err; use std::{ - collections::{BTreeMap, HashMap, HashSet}, + collections::{hash_set, BTreeMap, HashMap, HashSet}, future::Future, path::Prefix, pin::{pin, Pin}, @@ -10,19 +10,20 @@ use std::{ }; use tokio::io::{AsyncBufRead, AsyncBufReadExt, AsyncRead, AsyncReadExt}; -static MAX_STANZA_SIZE: usize = 65536; - use crate::{ - element::{Content, Element, Name, NamespaceDeclaration}, + declaration::{Declaration, VersionInfo}, + element::{Content, Element, FromElement, Name, NamespaceDeclaration}, error::Error, xml::{self, parsers::Parser}, - Result, + Result, XMLNS_NS, XML_NS, }; +static MAX_STANZA_SIZE: usize = 65536; + /// streaming reader that tracks depth and available namespaces at current depth pub struct Reader { inner: R, - buffer: Buffer, + pub buffer: Buffer, // holds which tags we are in atm over depth // to have names reference namespaces could depth: Vec, @@ -31,13 +32,27 @@ pub struct Reader { impl Reader { pub fn new(reader: R) -> Self { + let mut default_declarations = HashSet::new(); + default_declarations.insert(NamespaceDeclaration { + prefix: Some("xml".to_string()), + namespace: XML_NS.to_string(), + }); + default_declarations.insert(NamespaceDeclaration { + prefix: Some("xmlns".to_string()), + namespace: XMLNS_NS.to_string(), + }); Self { inner: reader, buffer: Buffer::with_capacity(MAX_STANZA_SIZE), depth: Vec::new(), - namespace_declarations: Vec::new(), + // TODO: make sure reserved namespaces are never overwritten + namespace_declarations: vec![default_declarations], } } + + pub fn into_inner(self) -> R { + self.inner + } } impl Reader @@ -48,18 +63,35 @@ where Ok(self.inner.read_buf(&mut self.buffer).await?) } - pub async fn read_prolog<'s>(&'s mut self) -> Result<()> { + pub async fn read_prolog<'s>(&'s mut self) -> Result> { loop { - self.read_buf().await?; let input = str::from_utf8(self.buffer.data())?; match xml::Prolog::parse(input) { - Ok((rest, _prolog)) => { + Ok((rest, (decl, _misc, _doctype_decl))) => { let len = self.buffer.available_data() - rest.as_bytes().len(); - self.buffer.consume(len); - return Ok(()); + // TODO: return error if there is a doctype decl + if let Some(decl) = decl { + let declaration = Declaration { + version_info: match *decl.version_info { + xml::VersionNum::One => VersionInfo::One, + xml::VersionNum::OneDotOne => VersionInfo::OneDotOne, + }, + encoding_decl: decl + .encoding_decl + .map(|encoding_decl| (**encoding_decl).to_string()), + sd_decl: decl.sd_decl.map(|sd_decl| *sd_decl), + }; + self.buffer.consume(len); + return Ok(Some(declaration)); + } else { + self.buffer.consume(len); + return Ok(None); + } } std::result::Result::Err(e) => match e { - Err::Incomplete(_) => {} + Err::Incomplete(_) => { + self.read_buf().await?; + } // TODO: better error Err::Error(e) => return Err(Error::ParseError(e.to_string())), Err::Failure(e) => return Err(Error::ParseError(e.to_string())), @@ -68,9 +100,18 @@ where } } + pub async fn read_start<'s, T: FromElement>(&'s mut self) -> Result { + let element = self.read_start_tag().await?; + FromElement::from_element(element) + } + + pub async fn read<'s, T: FromElement>(&'s mut self) -> Result { + let element = self.read_element().await?; + FromElement::from_element(element) + } + pub async fn read_start_tag<'s>(&'s mut self) -> Result { loop { - self.read_buf().await?; let input = str::from_utf8(self.buffer.data())?; match xml::STag::parse(input) { Ok((rest, e)) => { @@ -84,7 +125,9 @@ where return Ok(element); } std::result::Result::Err(e) => match e { - Err::Incomplete(_) => {} + Err::Incomplete(_) => { + self.read_buf().await?; + } // TODO: better error Err::Error(e) => return Err(Error::ParseError(e.to_string())), Err::Failure(e) => return Err(Error::ParseError(e.to_string())), @@ -95,7 +138,6 @@ where pub async fn read_end_tag<'s>(&'s mut self) -> Result<()> { loop { - self.read_buf().await?; let input = str::from_utf8(self.buffer.data())?; match xml::ETag::parse(input) { Ok((rest, e)) => { @@ -109,7 +151,9 @@ where return Ok(()); } std::result::Result::Err(e) => match e { - Err::Incomplete(_) => {} + Err::Incomplete(_) => { + self.read_buf().await?; + } // TODO: better error Err::Error(e) => return Err(Error::ParseError(e.to_string())), Err::Failure(e) => return Err(Error::ParseError(e.to_string())), @@ -120,7 +164,6 @@ where pub async fn read_element<'s>(&'s mut self) -> Result { loop { - self.read_buf().await?; let input = str::from_utf8(self.buffer.data())?; match xml::Element::parse(input) { Ok((rest, e)) => { @@ -131,7 +174,9 @@ where return Ok(element); } std::result::Result::Err(e) => match e { - Err::Incomplete(_) => {} + Err::Incomplete(_) => { + self.read_buf().await?; + } // TODO: better error Err::Error(e) => return Err(Error::ParseError(e.to_string())), Err::Failure(e) => return Err(Error::ParseError(e.to_string())), @@ -144,7 +189,6 @@ where let mut last_char = false; let mut text = String::new(); loop { - self.read_buf().await?; let input = str::from_utf8(self.buffer.data())?; if last_char == false { match xml::CharData::parse(input) { @@ -155,7 +199,9 @@ where last_char = true; } std::result::Result::Err(e) => match e { - Err::Incomplete(_needed) => continue, + Err::Incomplete(_) => { + self.read_buf().await?; + } _ => match xml::ContentItem::parse(input) { Ok((rest, content_item)) => match content_item { xml::ContentItem::Element(element) => { @@ -207,7 +253,9 @@ where } }, std::result::Result::Err(e) => match e { - Err::Incomplete(_) => continue, + Err::Incomplete(_) => { + self.read_buf().await?; + } // TODO: better error Err::Error(e) => return Err(Error::ParseError(e.to_string())), Err::Failure(e) => return Err(Error::ParseError(e.to_string())), @@ -263,7 +311,9 @@ where } }, std::result::Result::Err(e) => match e { - Err::Incomplete(_) => continue, + Err::Incomplete(_) => { + self.read_buf().await?; + } // TODO: better error Err::Error(e) => return Err(Error::ParseError(e.to_string())), Err::Failure(e) => return Err(Error::ParseError(e.to_string())), @@ -622,7 +672,11 @@ impl Reader { for (content_item, char_data) in xml_content.content { match content_item { xml::ContentItem::Element(element) => { - text.map(|text| content.push(Content::Text(text))); + text.map(|text| { + if !text.is_empty() { + content.push(Content::Text(text)) + } + }); content.push(Content::Element(Self::element_from_xml( namespaces, element, )?)); @@ -655,7 +709,11 @@ impl Reader { } } } - text.map(|text| content.push(Content::Text(text))); + text.map(|text| { + if !text.is_empty() { + content.push(Content::Text(text)) + } + }); Ok(content) } } diff --git a/src/writer.rs b/src/writer.rs index dc5b48a..e319fdc 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -5,10 +5,11 @@ use futures::Sink; use tokio::io::{AsyncWrite, AsyncWriteExt}; use crate::{ - element::{escape_str, Content, Element, Name, NamespaceDeclaration}, + declaration::{Declaration, VersionInfo}, + element::{escape_str, Content, Element, IntoElement, Name, NamespaceDeclaration}, error::Error, - xml::{self, composers::Composer, parsers_complete::Parser, ETag}, - Result, + xml::{self, composers::Composer, parsers_complete::Parser, ETag, XMLDecl}, + Result, XMLNS_NS, XML_NS, }; // pub struct Writer { @@ -20,21 +21,69 @@ pub struct Writer { impl Writer { pub fn new(writer: W) -> Self { + let mut default_declarations = HashSet::new(); + default_declarations.insert(NamespaceDeclaration { + prefix: Some("xml".to_string()), + namespace: XML_NS.to_string(), + }); + default_declarations.insert(NamespaceDeclaration { + prefix: Some("xmlns".to_string()), + namespace: XMLNS_NS.to_string(), + }); Self { inner: writer, depth: Vec::new(), - namespace_declarations: Vec::new(), + namespace_declarations: vec![default_declarations], } } + + pub fn into_inner(self) -> W { + self.inner + } } impl Writer { + pub async fn write_declaration(&mut self, version: VersionInfo) -> Result<()> { + let declaration = Declaration::version(version); + let version_info; + match declaration.version_info { + VersionInfo::One => version_info = xml::VersionInfo::SingleQuoted(xml::VersionNum::One), + VersionInfo::OneDotOne => { + version_info = xml::VersionInfo::SingleQuoted(xml::VersionNum::OneDotOne) + } + } + let declaration = xml::XMLDecl { + version_info, + encoding_decl: None, + sd_decl: None, + }; + declaration.write(&mut self.inner).await?; + Ok(()) + } + + pub async fn write_full(&mut self, into_element: &impl IntoElement) -> Result<()> { + let element = into_element.into_element(); + Ok(self.write_element(&element).await?) + } + + pub async fn write_start(&mut self, into_element: &impl IntoElement) -> Result<()> { + let element = into_element.into_element(); + Ok(self.write_element_start(&element).await?) + } + + pub async fn write_all_content(&mut self, into_element: &impl IntoElement) -> Result<()> { + for content in &into_element.get_content() { + self.write_content(content).await?; + } + Ok(()) + } + #[async_recursion] pub async fn write_element(&mut self, element: &Element) -> Result<()> { if element.content.is_empty() { self.write_empty(element).await?; } else { - self.write_start(element).await?; + self.write_element_start(element).await?; for content in &element.content { self.write_content(content).await?; } @@ -107,12 +156,11 @@ impl Writer { if let Some(prefix) = &prefix { att_name = xml::QName::PrefixedName(xml::PrefixedName { prefix: xml::Prefix::parse_full(prefix)?, - local_part: xml::LocalPart::parse_full(&element.name.local_name)?, + local_part: xml::LocalPart::parse_full(&name.local_name)?, }) } else { - att_name = xml::QName::UnprefixedName(xml::UnprefixedName::parse_full( - &element.name.local_name, - )?) + att_name = + xml::QName::UnprefixedName(xml::UnprefixedName::parse_full(&name.local_name)?) } let value = xml::AttValue::from(value.as_str()); @@ -131,7 +179,7 @@ impl Writer { Ok(()) } - pub async fn write_start(&mut self, element: &Element) -> Result<()> { + pub async fn write_element_start(&mut self, element: &Element) -> Result<()> { let namespace_declarations_stack: Vec<_> = self .namespace_declarations .iter() @@ -195,12 +243,11 @@ impl Writer { if let Some(prefix) = &prefix { att_name = xml::QName::PrefixedName(xml::PrefixedName { prefix: xml::Prefix::parse_full(prefix)?, - local_part: xml::LocalPart::parse_full(&element.name.local_name)?, + local_part: xml::LocalPart::parse_full(&name.local_name)?, }) } else { - att_name = xml::QName::UnprefixedName(xml::UnprefixedName::parse_full( - &element.name.local_name, - )?) + att_name = + xml::QName::UnprefixedName(xml::UnprefixedName::parse_full(&name.local_name)?) } let value = xml::AttValue::from(value.as_str()); diff --git a/src/xml/mod.rs b/src/xml/mod.rs index 3150df0..43f3027 100644 --- a/src/xml/mod.rs +++ b/src/xml/mod.rs @@ -389,9 +389,9 @@ pub type Prolog<'s> = ( /// [23] XMLDecl ::= '' #[derive(Debug)] pub struct XMLDecl<'s> { - version_info: VersionInfo, - encoding_decl: Option>, - sd_decl: Option, + pub(crate) version_info: VersionInfo, + pub(crate) encoding_decl: Option>, + pub(crate) sd_decl: Option, } /// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') @@ -401,6 +401,17 @@ pub enum VersionInfo { DoubleQuoted(VersionNum), } +impl Deref for VersionInfo { + type Target = VersionNum; + + fn deref(&self) -> &Self::Target { + match self { + VersionInfo::SingleQuoted(version_num) => version_num, + VersionInfo::DoubleQuoted(version_num) => version_num, + } + } +} + /// [25] Eq ::= S? '=' S? #[derive(Clone)] pub struct Eq; @@ -479,6 +490,17 @@ pub enum SDDecl { DoubleQuoted(bool), } +impl Deref for SDDecl { + type Target = bool; + + fn deref(&self) -> &Self::Target { + match self { + SDDecl::SingleQuoted(b) => b, + SDDecl::DoubleQuoted(b) => b, + } + } +} + // (Productions 33 through 38 have been removed.) /// [39] element ::= EmptyElemTag | STag content ETag @@ -846,10 +868,26 @@ pub struct ExtParsedEnt<'s> { // TODO?: select quote version pub struct EncodingDecl<'s>(EncName<'s>); +impl<'s> Deref for EncodingDecl<'s> { + type Target = EncName<'s>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* #[derive(Debug)] pub struct EncName<'s>(&'s str); +impl<'s> Deref for EncName<'s> { + type Target = &'s str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + #[derive(Debug)] pub enum NotationDeclID<'s> { External(ExternalID<'s>),