diff --git a/src/error.rs b/src/error.rs index 1f9c1e6..c84c7d0 100644 --- a/src/error.rs +++ b/src/error.rs @@ -14,6 +14,7 @@ pub enum Error { DuplicateAttribute(String), UnqualifiedNamespace(String), MismatchedEndTag(String, String), + NotInElement(String), } impl From for Error { diff --git a/src/reader.rs b/src/reader.rs index bca8edd..dc16d31 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,9 +1,11 @@ use circular::Buffer; -use futures::Stream; +use futures::{FutureExt, Stream}; use nom::Err; use std::{ collections::{BTreeMap, HashMap, HashSet}, + future::Future, path::Prefix, + pin::{pin, Pin}, str::{self, FromStr}, }; use tokio::io::{AsyncBufRead, AsyncBufReadExt, AsyncRead, AsyncReadExt}; @@ -42,14 +44,77 @@ impl Reader where R: AsyncRead + Unpin, { - async fn read_buf(&mut self) -> Result { + async fn read_buf<'s>(&mut self) -> Result { Ok(self.inner.read_buf(&mut self.buffer).await?) } - async fn read_element<'s>(&'s mut self) -> Result { - self.read_buf().await?; - let mut input = str::from_utf8(self.buffer.data())?; + async fn read_prolog<'s>(&'s mut self) -> Result<()> { loop { + self.read_buf().await?; + let input = str::from_utf8(self.buffer.data())?; + match xml::Prolog::parse(input) { + Ok((rest, _prolog)) => { + let len = self.buffer.available_data() - rest.as_bytes().len(); + self.buffer.consume(len); + return Ok(()); + } + std::result::Result::Err(e) => match e { + Err::Incomplete(_) => {} + // TODO: better error + Err::Error(e) => return Err(Error::ParseError(e.to_string())), + Err::Failure(e) => return Err(Error::ParseError(e.to_string())), + }, + } + } + } + + async fn read_start_tag<'s>(&'s mut self) -> Result { + loop { + self.read_buf().await?; + let input = str::from_utf8(self.buffer.data())?; + match xml::STag::parse(input) { + Ok((rest, e)) => { + let len = self.buffer.available_data() - rest.as_bytes().len(); + let element = + Reader::::start_tag_from_xml(&mut self.depth, &mut self.namespaces, e)?; + self.buffer.consume(len); + return Ok(element); + } + std::result::Result::Err(e) => match e { + Err::Incomplete(_) => {} + // TODO: better error + Err::Error(e) => return Err(Error::ParseError(e.to_string())), + Err::Failure(e) => return Err(Error::ParseError(e.to_string())), + }, + } + } + } + + async fn read_end_tag<'s>(&'s mut self) -> Result<()> { + loop { + self.read_buf().await?; + let input = str::from_utf8(self.buffer.data())?; + match xml::ETag::parse(input) { + Ok((rest, e)) => { + let len = self.buffer.available_data() - rest.as_bytes().len(); + Reader::::end_tag_from_xml(&mut self.depth, &mut self.namespaces, e)?; + self.buffer.consume(len); + return Ok(()); + } + std::result::Result::Err(e) => match e { + Err::Incomplete(_) => {} + // TODO: better error + Err::Error(e) => return Err(Error::ParseError(e.to_string())), + Err::Failure(e) => return Err(Error::ParseError(e.to_string())), + }, + } + } + } + + async fn read_element<'s>(&'s mut self) -> Result { + loop { + self.read_buf().await?; + let input = str::from_utf8(self.buffer.data())?; match xml::Element::parse(input) { Ok((rest, e)) => { let len = self.buffer.available_data() - rest.as_bytes().len(); @@ -58,10 +123,37 @@ where return Ok(element); } std::result::Result::Err(e) => match e { - Err::Incomplete(_) => { - self.read_buf().await?; - input = str::from_utf8(self.buffer.data())?; + Err::Incomplete(_) => {} + // TODO: better error + Err::Error(e) => return Err(Error::ParseError(e.to_string())), + Err::Failure(e) => return Err(Error::ParseError(e.to_string())), + }, + } + } + } + + async fn read_content<'s>(&'s mut self) -> Result { + loop { + self.read_buf().await?; + let input = str::from_utf8(self.buffer.data())?; + + match xml::ContentItem::parse(input) { + Ok((rest, c)) => { + match c { + xml::ContentItem::CharData(char_data) => todo!(), + xml::ContentItem::Element(element) => todo!(), + xml::ContentItem::Reference(reference) => todo!(), + xml::ContentItem::CDSect(cdsect) => todo!(), + xml::ContentItem::PI(pi) => todo!(), + xml::ContentItem::Comment(comment) => todo!(), } + let len = self.buffer.available_data() - rest.as_bytes().len(); + let content = Reader::::content_item_from_xml(&mut self.namespaces, e)?; + self.buffer.consume(len); + return Ok(element); + } + std::result::Result::Err(e) => match e { + Err::Incomplete(_) => {} // TODO: better error Err::Error(e) => return Err(Error::ParseError(e.to_string())), Err::Failure(e) => return Err(Error::ParseError(e.to_string())), @@ -72,6 +164,145 @@ where } impl Reader { + fn content_item_from_xml( + namespaces: &mut Vec>, + item: xml::ContentItem, + ) -> Result { + todo!() + } + + fn start_tag_from_xml( + depth: &mut Vec, + namespaces: &mut Vec>, + s_tag: xml::STag, + ) -> Result { + let mut namespace_declarations = HashSet::new(); + for (prefix, namespace) in s_tag.attributes.iter().filter_map(|attribute| { + if let xml::Attribute::NamespaceDeclaration { ns_name, value } = attribute { + Some((ns_name, value)) + } else { + None + } + }) { + let prefix = match prefix { + xml::NSAttName::PrefixedAttName(prefixed_att_name) => { + Some(prefixed_att_name.to_string()) + } + xml::NSAttName::DefaultAttName => None, + }; + let namespace = Namespace { + prefix, + namespace: namespace.process()?, + }; + if !namespace_declarations.insert(namespace.clone()) { + return Err(Error::DuplicateNameSpace(namespace)); + } + } + + // all namespaces available to the element (from both parent elements and element itself) + let namespace_stack: Vec<&Namespace> = namespaces + .iter() + .flatten() + .chain(namespace_declarations.iter()) + .collect(); + + let mut attributes = HashMap::new(); + + for (q_name, value) in s_tag.attributes.iter().filter_map(|attribute| { + if let xml::Attribute::Attribute { name, value } = attribute { + Some((name, value)) + } else { + None + } + }) { + let namespace; + let attribute_name; + match q_name { + xml::QName::PrefixedName(prefixed_name) => { + namespace = namespace_stack.iter().rfind(|namespace| { + namespace.prefix.as_deref() == Some(**prefixed_name.prefix) + }); + attribute_name = prefixed_name.local_part.to_string(); + } + xml::QName::UnprefixedName(unprefixed_name) => { + namespace = namespace_stack + .iter() + .rfind(|namespace| namespace.prefix == None); + attribute_name = unprefixed_name.to_string(); + } + } + if let Some(namespace) = namespace { + let namespace = (*namespace).clone(); + let name = Name { + namespace, + name: attribute_name, + }; + let value = value.process()?; + if let Some(_value) = attributes.insert(name, value) { + return Err(Error::DuplicateAttribute(q_name.to_string())); + } + } else { + return Err(Error::UnqualifiedNamespace(q_name.to_string())); + } + } + + let name; + let namespace; + match &s_tag.name { + xml::QName::PrefixedName(prefixed_name) => { + namespace = namespace_stack + .iter() + .rfind(|namespace| namespace.prefix.as_deref() == Some(**prefixed_name.prefix)); + name = prefixed_name.local_part.to_string(); + } + xml::QName::UnprefixedName(unprefixed_name) => { + namespace = namespace_stack + .iter() + .rfind(|namespace| namespace.prefix == None); + name = unprefixed_name.to_string(); + } + } + + let namespace = (*namespace + .ok_or_else(|| Error::UnqualifiedNamespace(s_tag.name.to_string()))?) + .clone(); + + let name = Name { namespace, name }; + + depth.push(name.clone()); + + namespaces.push(namespace_declarations.clone()); + + return Ok(Element { + name, + namespace_decl: namespace_declarations, + attributes, + content: Vec::new(), + }); + } + + fn end_tag_from_xml( + depth: &mut Vec, + namespaces: &mut Vec>, + e_tag: xml::ETag, + ) -> Result<()> { + if let Some(s_tag_name) = depth.pop() { + if s_tag_name.namespace.prefix.as_deref() == e_tag.name.prefix() + && s_tag_name.name == e_tag.name.local_part() + { + namespaces.pop(); + return Ok(()); + } else { + return Err(Error::MismatchedEndTag( + s_tag_name.name, + e_tag.name.to_string(), + )); + } + } else { + return Err(Error::NotInElement(e_tag.name.to_string())); + } + } + fn element_from_xml( namespaces: &mut Vec>, element: xml::Element, @@ -343,88 +574,22 @@ impl Reader { } } -// impl Reader -// where -// R: AsyncBufReadExt + Unpin, -// { -// /// could resursively read and include namespace tree with values to be shadowed within new local context -// async fn read_recursive(&mut self, namespaces: BTreeMap, String>) -> Result { -// let element; -// let len; -// loop { -// let buf = self.inner.fill_buf().await?; -// let input = str::from_utf8(buf)?; -// match crate::xml::element(input) { -// Ok((rest, e)) => { -// element = e; -// len = buf.len() - rest.len(); -// break; -// } -// Err(e) => match e { -// Err::Incomplete(_) => (), -// e => return Err::(Error::ParseError(input.to_owned())), -// }, -// } -// } +impl Stream for Reader { + type Item = Result; -// let final; -// match element { -// crate::xml::Element::Empty(e) => { -// let final = Element { - -// } -// }, -// crate::xml::Element::NotEmpty(_, _, _) => todo!(), -// } - -// self.inner.consume(len); -// todo!() -// } -// /// reads entire next prolog, element, or misc -// pub async fn read>(&mut self) -> Result { -// let element; -// let len; -// loop { -// let buf = self.inner.fill_buf().await?; -// let input = str::from_utf8(buf)?; -// match crate::xml::element(input) { -// Ok((rest, e)) => { -// element = e; -// len = buf.len() - rest.len(); -// break; -// } -// Err(e) => match e { -// Err::Incomplete(_) => (), -// e => return Err::(Error::ParseError(input.to_owned())), -// }, -// } -// } -// self.inner.consume(len); - -// // Ok(element) -// todo!() -// } -// pub async fn read_start(&self) -> Result, Error> { -// todo!() -// } -// pub async fn read_end(&self) -> Result<(), Error> { -// todo!() -// } -// } - -// impl Stream for Reader { -// type Item = impl From; - -// async fn poll_next( -// self: std::pin::Pin<&mut Self>, -// cx: &mut std::task::Context<'_>, -// ) -> std::task::Poll> { -// todo!() -// } -// } + fn poll_next( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let mut e = self; + let mut pinned = pin!(e.read_content()); + pinned.as_mut().poll(cx).map(|result| Some(result)) + } +} #[cfg(test)] mod test { + use futures::{sink::Buffer, StreamExt}; use tokio::io::AsyncRead; use super::Reader; @@ -448,10 +613,7 @@ mod test { } } - #[tokio::test] - async fn test_element_read() { - let mock = MockAsyncReader::new( - "asdf"; + + #[tokio::test] + async fn test_element_read() { + let mock = MockAsyncReader::new(TEST_DOC); let mut reader = Reader::new(mock); let element = reader.read_element().await.unwrap(); println!("{:#?}", element); } + + #[tokio::test] + async fn test_element_stream() { + let mock = MockAsyncReader::new(TEST_DOC); + let mut reader = Reader::new(mock); + let element = reader.read_start_tag().await.unwrap(); + println!("{:#?}", element); + loop { + let element = reader.next().await.unwrap(); + println!("{:#?}", element); + } + } } diff --git a/src/xml/composers.rs b/src/xml/composers.rs index 949bb65..b8fbe13 100644 --- a/src/xml/composers.rs +++ b/src/xml/composers.rs @@ -817,6 +817,7 @@ impl<'s> Composer<'s> for Content<'s> { ContentItem::CDSect(cd_sect) => cd_sect.write(writer).await?, ContentItem::PI(pi) => pi.write(writer).await?, ContentItem::Comment(comment) => comment.write(writer).await?, + _ => todo!("verify no split chardata"), } if let Some(char_data) = char_data { char_data.write(writer).await?; diff --git a/src/xml/mod.rs b/src/xml/mod.rs index 221c334..9424d0b 100644 --- a/src/xml/mod.rs +++ b/src/xml/mod.rs @@ -47,6 +47,22 @@ pub enum QName<'s> { UnprefixedName(UnprefixedName<'s>), } +impl<'s> QName<'s> { + pub fn prefix(&self) -> Option<&'s str> { + match self { + QName::PrefixedName(prefixed_name) => return Some(**prefixed_name.prefix), + QName::UnprefixedName(_) => return None, + } + } + + pub fn local_part(&self) -> &str { + match self { + QName::PrefixedName(prefixed_name) => return **prefixed_name.local_part, + QName::UnprefixedName(unprefixed_name) => return ****unprefixed_name, + } + } +} + impl<'s> ToString for QName<'s> { fn to_string(&self) -> String { match self { @@ -473,7 +489,7 @@ pub struct ETag<'s> { #[derive(Debug, Clone)] pub enum ContentItem<'s> { - // CharData(&'s str), + CharData(CharData<'s>), Element(Element<'s>), Reference(Reference<'s>), CDSect(CDSect<'s>), diff --git a/src/xml/parsers.rs b/src/xml/parsers.rs index 3f67be7..3cbefd3 100644 --- a/src/xml/parsers.rs +++ b/src/xml/parsers.rs @@ -733,6 +733,23 @@ impl<'s> Parser<'s, ETag<'s>> for ETag<'s> { } } +impl<'s> Parser<'s, ContentItem<'s>> for ContentItem<'s> { + fn parse(input: &'s str) -> IResult<&str, ContentItem<'s>> { + alt(( + map(CharData::parse, |char_data| { + ContentItem::CharData(char_data) + }), + map(Element::parse, |element| ContentItem::Element(element)), + map(Reference::parse, |reference| { + ContentItem::Reference(reference) + }), + map(CDSect::parse, |cd_sect| ContentItem::CDSect(cd_sect)), + map(PI::parse, |pi| ContentItem::PI(pi)), + map(Comment::parse, |comment| ContentItem::Comment(comment)), + ))(input) + } +} + /// [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* impl<'s> Parser<'s, Content<'s>> for Content<'s> { fn parse(input: &'s str) -> IResult<&str, Content<'s>> {