WIP: parsers
This commit is contained in:
parent
844f3a5d11
commit
a92aee921d
11
src/event.rs
11
src/event.rs
|
@ -1 +1,12 @@
|
|||
// Event kinds: tags, declarations, comments, and text. Each event carries only the individual bits it contains; e.g. a tag contains its attributes, namespace declarations, and lang, which apply ONLY within that tag.
|
||||
|
||||
pub enum Event<'s> {
|
||||
StartTag(Vec<Event<'s>>),
|
||||
EmptyTag(Vec<Event>),
|
||||
Attribute(())
|
||||
CData(&'s str),
|
||||
Comment(&'s str),
|
||||
Declaration(Vec<Attribute<'s>>),
|
||||
Attribute((&'str))
|
||||
EndTag,
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
mod element;
|
||||
mod error;
|
||||
mod parser;
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
|
|
283
src/parser.rs
283
src/parser.rs
|
@ -1 +1,284 @@
|
|||
use std::char;
|
||||
|
||||
use nom::{
|
||||
branch::alt,
|
||||
bytes::{
|
||||
complete::take_until,
|
||||
streaming::{is_a, tag, take},
|
||||
},
|
||||
character::{
|
||||
complete::one_of,
|
||||
streaming::{char, digit1, none_of, satisfy},
|
||||
},
|
||||
combinator::{cond, map, map_parser, map_res, not, opt, recognize, value, verify},
|
||||
error::ErrorKind,
|
||||
multi::{many0, many1},
|
||||
sequence::{delimited, pair, preceded, tuple},
|
||||
Err, IResult, Parser,
|
||||
};
|
||||
|
||||
// parser: parses tokens from lexer into events
|
||||
|
||||
/// One item of the `Misc*` runs used by the `document` and `prolog` productions.
enum Misc<'s> {
|
||||
/// A comment.
Comment(Comment<'s>),
|
||||
/// A `PI` value (NOTE(review): presumably a processing instruction — confirm).
PI(PI<'s>),
|
||||
}
|
||||
|
||||
/// A comment, represented as a string slice borrowed for `'s`.
type Comment<'s> = &'s str;
|
||||
|
||||
/// A target plus an optional instruction string.
/// NOTE(review): presumably models an XML processing instruction — confirm.
struct PI<'s> {
|
||||
/// The target name.
target: &'s str,
|
||||
/// The instruction text, if any was given.
instruction: Option<&'s str>,
|
||||
}
|
||||
|
||||
/// One piece of an element's content.
enum ContentItem<'s> {
|
||||
/// Character data, as a borrowed slice.
CharData(&'s str),
|
||||
/// A nested element.
Element(Element<'s>),
|
||||
/// A reference (`Reference` is declared outside this view).
Reference(Reference<'s>),
|
||||
/// A CDATA section (`CDSect` is declared outside this view).
CDSect(CDSect<'s>),
|
||||
}
|
||||
|
||||
/// The content of an element: `None`, or a list of content items.
// NOTE(review): presumably `None` marks an empty-element tag — confirm.
type Content<'s> = Option<Vec<ContentItem<'s>>>;
|
||||
|
||||
/// A parsed element: its name, its attributes, and its content.
struct Element<'s> {
|
||||
/// The element name.
name: &'s str,
|
||||
/// The attributes attached to the element.
attributes: Vec<Attribute<'s>>,
|
||||
/// The element's content (see `Content`).
content: Content<'s>,
|
||||
}
|
||||
|
||||
/// A key/value attribute pair; both parts are borrowed slices.
struct Attribute<'s> {
|
||||
/// The attribute name.
key: &'s str,
|
||||
/// The attribute value.
value: &'s str,
|
||||
}
|
||||
|
||||
// type VersionNum<'s> = &'s str;
|
||||
/// An encoding name, held as a borrowed string slice.
///
/// Contains only Latin characters or dashes after the first character.
/// NOTE(review): nothing visible here enforces that rule yet — confirm where it is checked.
|
||||
type EncName<'s> = &'s str;
|
||||
|
||||
// struct XMLDecl<'s> {
|
||||
// version_info: VersionNum<'s>,
|
||||
// encoding_decl: Option<EncName<'s>>,
|
||||
// sd_decl: Option<bool>,
|
||||
// }
|
||||
|
||||
/// A document type declaration. Only the name is modelled so far.
struct DoctypeDecl<'s> {
|
||||
/// The name given in the declaration.
name: &'s str,
|
||||
// TODO: the remaining parts of the declaration are not represented yet.
|
||||
}
|
||||
|
||||
/// Parser stub for a document type declaration.
///
/// # Panics
///
/// Always panics: the body is still `todo!()`.
pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// Parser stub for an element.
///
/// # Panics
///
/// Always panics: the body is still `todo!()`.
pub fn element(input: &str) -> IResult<&str, Element> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// Parser stub for a single `Misc` item.
///
/// # Panics
///
/// Always panics: the body is still `todo!()`.
pub fn misc(input: &str) -> IResult<&str, Misc> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
|
||||
/// [1] document ::= prolog element Misc*
|
||||
pub fn document(input: &str) -> IResult<&str, Document> {
|
||||
tuple((prolog, element, many0(misc)))(input)
|
||||
}
|
||||
|
||||
type Char = char;
|
||||
/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
|
||||
pub fn xmlchar(input: &str) -> IResult<&str, Char> {
|
||||
satisfy(
|
||||
|c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'),
|
||||
)(input)
|
||||
}
|
||||
|
||||
type S<'s> = &'s str;
|
||||
/// [3] S ::= (#x20 | #x9 | #xD | #xA)+
|
||||
pub fn s(input: &str) -> IResult<&str, S> {
|
||||
is_a("\u{20}\u{9}\u{D}\u{A}")(input)
|
||||
}
|
||||
|
||||
type NameStartChar = char;
|
||||
/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
|
||||
pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
|
||||
satisfy(
|
||||
|c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'),
|
||||
)(input)
|
||||
}
|
||||
|
||||
type NameChar = char;
|
||||
/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
|
||||
pub fn name_char(input: &str) -> IResult<&str, NameChar> {
|
||||
alt((
|
||||
name_start_char,
|
||||
satisfy(
|
||||
|c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'),
|
||||
),
|
||||
))(input)
|
||||
}
|
||||
|
||||
type Name<'s> = &'s str;
|
||||
/// [5] Name ::= NameStartChar (NameChar)*
|
||||
pub fn name(input: &str) -> IResult<&str, Name> {
|
||||
recognize(pair(name_start_char, many0(name_char)))(input)
|
||||
}
|
||||
|
||||
type Names<'s> = &'s str;
|
||||
/// [6] Names ::= Name (#x20 Name)*
|
||||
pub fn names(input: &str) -> IResult<&str, Names> {
|
||||
recognize(pair(name, many0(pair(char('\u{20}'), name))))(input)
|
||||
}
|
||||
|
||||
type Nmtoken<'s> = &'s str;
|
||||
/// [7] Nmtoken ::= (NameChar)+
|
||||
pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
|
||||
recognize(many1(name_char))(input)
|
||||
}
|
||||
|
||||
type Nmtokens<'s> = &'s str;
|
||||
/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
|
||||
pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
|
||||
recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input)
|
||||
}
|
||||
|
||||
type EntityValue<'s> = &'s str;
|
||||
/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
|
||||
/// | "'" ([^%&'] | PEReference | Reference)* "'"
|
||||
pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
|
||||
alt((
|
||||
delimited(
|
||||
char('"'),
|
||||
recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
|
||||
char('"'),
|
||||
),
|
||||
delimited(
|
||||
char('\''),
|
||||
recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
|
||||
char('\''),
|
||||
),
|
||||
))(input)
|
||||
}
|
||||
|
||||
type AttValue<'s> = &'s str;
|
||||
/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
|
||||
/// | "'" ([^<&'] | Reference)* "'"
|
||||
pub fn att_value(input: &str) -> IResult<&str, AttValue> {
|
||||
alt((
|
||||
delimited(
|
||||
char('"'),
|
||||
recognize(many0(alt((none_of("<&\""), reference)))),
|
||||
char('"'),
|
||||
),
|
||||
delimited(
|
||||
char('\''),
|
||||
recognize(many0(alt((none_of("<&'"), reference)))),
|
||||
char('\''),
|
||||
),
|
||||
))(input)
|
||||
}
|
||||
|
||||
type SystemLiteral<'s> = &'s str;
|
||||
/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
|
||||
pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
|
||||
alt((
|
||||
delimited(char('"'), recognize(many0(none_of("\""))), char('"')),
|
||||
delimited(char('\''), recognize(many0(none_of("'"))), char('\'')),
|
||||
))(input)
|
||||
}
|
||||
|
||||
type PubidLiteral<'s> = &'s str;
|
||||
/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
|
||||
pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
|
||||
alt((
|
||||
delimited(char('"'), recognize(many0(pubid_char)), char('"')),
|
||||
delimited(
|
||||
char('\''),
|
||||
recognize(many0(recognize(not(char('\''))).and_then(pubid_char))),
|
||||
char('\''),
|
||||
),
|
||||
))(input)
|
||||
}
|
||||
|
||||
type PubidChar<'s> = char;
|
||||
/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
|
||||
pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
|
||||
satisfy(|c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'))(
|
||||
input,
|
||||
)
|
||||
}
|
||||
|
||||
type CharData<'s> = &'s str;
|
||||
/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
|
||||
pub fn char_data(input: &str) -> IResult<&str, CharData> {
|
||||
take_until()(input)
|
||||
}
|
||||
|
||||
type Prolog<'s> = (
|
||||
Option<XMLDecl>,
|
||||
Vec<Misc<'s>>,
|
||||
Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
|
||||
);
|
||||
/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
|
||||
pub fn prolog(input: &str) -> IResult<&str, Prolog> {
|
||||
tuple((
|
||||
opt(xml_decl),
|
||||
many0(misc),
|
||||
opt(tuple((doctypedecl, many0(misc)))),
|
||||
))(input)
|
||||
}
|
||||
|
||||
/// The parts of an XML declaration, as assembled by `xml_decl`.
struct XMLDecl {
|
||||
/// The declared version.
version_info: VersionInfo,
|
||||
/// The encoding declaration, if present.
encoding_decl: Option<EncodingDecl>,
|
||||
/// The `SDDecl` part, if present (NOTE(review): presumably the standalone declaration — confirm).
sd_decl: Option<SDDecl>,
|
||||
}
|
||||
/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
|
||||
pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
|
||||
// (VersionInfo, Option<EncodingDecl>, Option<SDDecl>)
|
||||
let (leftover, (version_info, encoding_decl, sd_decl)) = delimited(
|
||||
tag("<?xml"),
|
||||
tuple((version_info, opt(encoding_decl), opt(sd_decl))),
|
||||
tag("?>"),
|
||||
)(input)?;
|
||||
Ok((
|
||||
leftover,
|
||||
XMLDecl {
|
||||
version_info,
|
||||
encoding_decl,
|
||||
sd_decl,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
type VersionInfo = VersionNum;
|
||||
/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
|
||||
pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
|
||||
preceded(
|
||||
tuple((s, tag("version"), eq)),
|
||||
alt((
|
||||
delimited(char('\''), version_num, char('\'')),
|
||||
delimited(char('"'), version_num, char('"')),
|
||||
)),
|
||||
)(input)
|
||||
}
|
||||
|
||||
/// [25] Eq ::= S? '=' S?
|
||||
pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> {
|
||||
tuple((opt(s), char('='), opt(s)))(input)
|
||||
}
|
||||
|
||||
/// The version numbers that `version_num` can produce.
#[derive(Clone)]
|
||||
enum VersionNum {
|
||||
/// Produced for the text `1.0`.
One,
|
||||
/// Produced for the text `1.1`.
OneDotOne,
|
||||
}
|
||||
/// [26] VersionNum ::= '1.' [0-9]+
|
||||
pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
|
||||
preceded(
|
||||
tag("1."),
|
||||
alt((
|
||||
value(VersionNum::One, char('0')),
|
||||
value(VersionNum::OneDotOne, char('1')),
|
||||
)),
|
||||
)(input)
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use futures::Stream;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::io::AsyncBufRead;
|
||||
|
||||
use crate::{
|
||||
element::{Element, Name, Namespace},
|
||||
|
@ -14,13 +14,19 @@ pub struct Reader<R> {
|
|||
namespaces: Vec<(usize, Namespace)>,
|
||||
}
|
||||
|
||||
impl<R: AsyncRead> Reader<R> {
|
||||
pub async fn read(&self) -> Result<impl From<Element>, Error> {}
|
||||
/// Reading operations for a `Reader` whose source implements `AsyncBufRead`.
///
/// Work in progress: `read` ends in `todo!()` and the other two methods have empty bodies.
impl<R> Reader<R>
|
||||
where
|
||||
R: AsyncBufRead,
|
||||
{
|
||||
/// Reads from the underlying stream; not implemented past filling the buffer.
// NOTE(review): `poll_fill_buf` is a poll-style method (it takes a pinned receiver and a
// task context), so awaiting its result directly looks wrong — the awaitable form is
// `AsyncBufReadExt::fill_buf`, which needs mutable access to the stream. Confirm.
pub async fn read(&self) -> Result<impl From<Element>, Error> {
|
||||
let buf = self.stream.poll_fill_buf().await?;
|
||||
todo!()
|
||||
}
|
||||
/// NOTE(review): presumably reads a start tag — the body is empty, so this does not yet
/// produce the declared `Result`.
pub async fn read_start(&self) -> Result<impl From<Element>, Error> {}
|
||||
/// NOTE(review): presumably reads an end tag — the body is empty, so this does not yet
/// produce the declared `Result`.
pub async fn read_end(&self) -> Result<(), Error> {}
|
||||
}
|
||||
|
||||
impl<R: AsyncRead> Stream for Reader<R> {
|
||||
impl<R: AsyncBufRead> Stream for Reader<R> {
|
||||
type Item = impl From<Element>;
|
||||
|
||||
async fn poll_next(
|
||||
|
|
Loading…
Reference in New Issue