Review notes (free text ahead of the first `diff --git` header; patch tools skip it).

* entity_value / att_value: the text branch was `recognize(many_till(take(1), peek(one_of(..))))`,
  which succeeds on ZERO characters. It therefore shadowed the reference branches and tripped
  many0's no-progress guard, so every literal failed. Replaced by `is_not(..)` (one or more chars).
* att_value: excluded set was `%&"`; production [10] excludes `<&"` (and `<&'`).
* char_ref: `take_while` accepted `&#;` / `&#x;`; production [66] needs one or more digits.
* ext_parsed_ent: the body still returned the bare tuple although the signature now returns
  the `ExtParsedEnt` struct.
* Text between `<` and `>` had been stripped from this patch. Generic arguments and the
  `<!ENTITY ... >` delimiters in ge_decl / pe_decl are reconstructed from productions [71]/[72]
  using only combinators already in scope. NOTE(review): swap the inline `is_a(..)` whitespace
  for the file's own S parser, and confirm `Attribute<'s>` / `NDataDecl<'s>` spellings.
* Context-line whitespace (notably doc-comment continuation alignment) is reconstructed; use
  `git apply --ignore-whitespace` if it does not apply cleanly. The post-image blob hash on the
  `index` line predates these fixes.

diff --git a/src/parser.rs b/src/parser.rs
index e689a53..882ebae 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,7 +2,7 @@ use std::char;
 
 use nom::{
     branch::{alt, permutation},
-    bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until},
+    bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until, take_while1},
     character::{
         complete::one_of,
         streaming::{alpha1, char, digit1, none_of, satisfy},
@@ -16,6 +16,8 @@ use nom::{
 
 // parser: parses tokens from lexer into events
 // no well formedness, validity, or data model, simple translation of input into rust types
+// output is a rust representation of the input xml
+// types could be used for xml production too?
 
 enum ContentItem<'s> {
     CharData(&'s str),
@@ -89,37 +91,64 @@ pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
     recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input)
 }
 
-type EntityValue<'s> = &'s str;
+/// One piece of a quoted literal: a run of plain text, or a reference kept unexpanded.
+enum LiteralData<'s> {
+    String(&'s str),
+    PEReference(PEReference<'s>),
+    Reference(Reference<'s>),
+}
+
+type EntityValue<'s> = Vec<LiteralData<'s>>;
 /// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
 ///                 |  "'" ([^%&'] | PEReference | Reference)* "'"
 pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
+    // is_not needs at least one char, so each many0 round consumes input or falls through.
     alt((
         delimited(
             char('"'),
-            recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
+            many0(alt((
+                map(is_not("%&\""), |string| LiteralData::String(string)),
+                map(pe_reference, |pe_reference| {
+                    LiteralData::PEReference(pe_reference)
+                }),
+                map(reference, |reference| LiteralData::Reference(reference)),
+            ))),
             char('"'),
         ),
         delimited(
             char('\''),
-            recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
+            many0(alt((
+                map(is_not("%&'"), |string| LiteralData::String(string)),
+                map(pe_reference, |pe_reference| {
+                    LiteralData::PEReference(pe_reference)
+                }),
+                map(reference, |reference| LiteralData::Reference(reference)),
+            ))),
             char('\''),
         ),
     ))(input)
 }
 
-type AttValue<'s> = &'s str;
+type AttValue<'s> = Vec<LiteralData<'s>>;
 /// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
 ///               |  "'" ([^<&'] | Reference)* "'"
 pub fn att_value(input: &str) -> IResult<&str, AttValue> {
+    // Excluded set is `<`, `&` and the active quote; `%` is ordinary text in an AttValue.
     alt((
         delimited(
             char('"'),
-            recognize(many0(alt((none_of("<&\""), reference)))),
+            many0(alt((
+                map(is_not("<&\""), |string| LiteralData::String(string)),
+                map(reference, |reference| LiteralData::Reference(reference)),
+            ))),
             char('"'),
         ),
         delimited(
             char('\''),
-            recognize(many0(alt((none_of("<&'"), reference)))),
+            many0(alt((
+                map(is_not("<&'"), |string| LiteralData::String(string)),
+                map(reference, |reference| LiteralData::Reference(reference)),
+            ))),
             char('\''),
         ),
     ))(input)
@@ -389,18 +418,18 @@ pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> {
     ))(input)
 }
 
-enum IntSubsetItem<'s> {
+enum IntSubsetDeclaration<'s> {
     MarkupDecl(MarkupDecl<'s>),
     DeclSep(DeclSep<'s>),
 }
-type IntSubset<'s> = Vec<IntSubsetItem<'s>>;
+type IntSubset<'s> = Vec<IntSubsetDeclaration<'s>>;
 /// [28b] intSubset ::= (markupdecl | DeclSep)*
 pub fn int_subset(input: &str) -> IResult<&str, IntSubset> {
     many0(alt((
         map(markup_decl, |markup_decl| {
-            IntSubsetItem::MarkupDecl(markup_decl)
+            IntSubsetDeclaration::MarkupDecl(markup_decl)
         }),
-        map(decl_sep, |decl_sep| IntSubsetItem::DeclSep(decl_sep)),
+        map(decl_sep, |decl_sep| IntSubsetDeclaration::DeclSep(decl_sep)),
     )))(input)
 }
 
@@ -432,9 +461,39 @@ pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> {
     ))(input)
 }
 
+struct ExtSubset<'s> {
+    text_decl: Option<TextDecl<'s>>,
+    ext_subset_decl: ExtSubsetDecl<'s>,
+}
 /// [30] extSubset ::= TextDecl? extSubsetDecl
+pub fn ext_subset(input: &str) -> IResult<&str, ExtSubset> {
+    map(
+        pair(opt(text_decl), ext_subset_decl),
+        |(text_decl, ext_subset_decl)| ExtSubset {
+            text_decl,
+            ext_subset_decl,
+        },
+    )(input)
+}
 
+enum ExtSubsetDeclaration<'s> {
+    MarkupDecl(MarkupDecl<'s>),
+    ConditionalSect(ConditionalSect<'s>),
+    DeclSep(DeclSep<'s>),
+}
+type ExtSubsetDecl<'s> = Vec<ExtSubsetDeclaration<'s>>;
 /// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
+pub fn ext_subset_decl(input: &str) -> IResult<&str, ExtSubsetDecl> {
+    many0(alt((
+        map(markup_decl, |markup_decl| {
+            ExtSubsetDeclaration::MarkupDecl(markup_decl)
+        }),
+        map(conditional_sect, |conditional_sect| {
+            ExtSubsetDeclaration::ConditionalSect(conditional_sect)
+        }),
+        map(decl_sep, |decl_sep| ExtSubsetDeclaration::DeclSep(decl_sep)),
+    )))(input)
+}
 
 type SDDecl = bool;
 /// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
@@ -458,10 +517,9 @@ pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> {
 
 // (Productions 33 through 38 have been removed.)
 
-struct Element<'s> {
-    name: &'s str,
-    attributes: Vec<Attribute<'s>>,
-    content: Content<'s>,
+enum Element<'s> {
+    Empty(EmptyElemTag<'s>),
+    NotEmpty(STag<'s>, Content<'s>, ETag<'s>),
 }
 /// [39] element ::= EmptyElemTag | STag content ETag
 pub fn element(input: &str) -> IResult<&str, Element> {
@@ -480,10 +538,30 @@ pub fn attribute(input: &str) -> IResult<&str, Attribute> {
     separated_pair(name, eq, att_value)(input)
 }
 
-type CharRef<'s> = &'s str;
+enum CharRef<'s> {
+    Decimal(&'s str),
+    Hexadecimal(&'s str),
+}
 /// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
 pub fn char_ref(input: &str) -> IResult<&str, CharRef> {
-    todo!()
+    // take_while1 rather than take_while: both forms require at least one digit.
+    alt((
+        delimited(
+            tag("&#"),
+            map(take_while1(|c| matches!(c, '0'..='9')), |decimal| {
+                CharRef::Decimal(decimal)
+            }),
+            tag(";"),
+        ),
+        delimited(
+            tag("&#x"),
+            map(
+                take_while1(|c| matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F')),
+                |hexadecimal| CharRef::Hexadecimal(hexadecimal),
+            ),
+            tag(";"),
+        ),
+    ))(input)
 }
 
 enum Reference<'s> {
@@ -510,7 +588,91 @@ pub fn pe_reference(input: &str) -> IResult<&str, PEReference> {
     delimited(tag("%"), name, tag(";"))(input)
 }
 
-/// TODO: entity declarations
+enum EntityDecl<'s> {
+    GEDecl(GEDecl<'s>),
+    PEDecl(PEDecl<'s>),
+}
+/// [70] EntityDecl ::= GEDecl | PEDecl
+pub fn entity_decl(input: &str) -> IResult<&str, EntityDecl> {
+    alt((
+        map(ge_decl, |ge_decl| EntityDecl::GEDecl(ge_decl)),
+        map(pe_decl, |pe_decl| EntityDecl::PEDecl(pe_decl)),
+    ))(input)
+}
+
+struct GEDecl<'s> {
+    name: Name<'s>,
+    entity_def: EntityDef<'s>,
+}
+/// [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
+pub fn ge_decl(input: &str) -> IResult<&str, GEDecl> {
+    map(
+        delimited(
+            pair(tag("<!ENTITY"), is_a("\u{20}\u{9}\u{D}\u{A}")),
+            separated_pair(name, is_a("\u{20}\u{9}\u{D}\u{A}"), entity_def),
+            pair(opt(is_a("\u{20}\u{9}\u{D}\u{A}")), tag(">")),
+        ),
+        |(name, entity_def)| GEDecl { name, entity_def },
+    )(input)
+}
+
+struct PEDecl<'s> {
+    name: Name<'s>,
+    pe_def: PEDef<'s>,
+}
+/// [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
+pub fn pe_decl(input: &str) -> IResult<&str, PEDecl> {
+    map(
+        delimited(
+            tuple((
+                tag("<!ENTITY"),
+                is_a("\u{20}\u{9}\u{D}\u{A}"),
+                tag("%"),
+                is_a("\u{20}\u{9}\u{D}\u{A}"),
+            )),
+            separated_pair(name, is_a("\u{20}\u{9}\u{D}\u{A}"), pe_def),
+            pair(opt(is_a("\u{20}\u{9}\u{D}\u{A}")), tag(">")),
+        ),
+        |(name, pe_def)| PEDecl { name, pe_def },
+    )(input)
+}
+
+enum EntityDef<'s> {
+    EntityValue(EntityValue<'s>),
+    ExternalID {
+        external_id: ExternalID<'s>,
+        ndata_decl: Option<NDataDecl<'s>>,
+    },
+}
+/// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
+pub fn entity_def(input: &str) -> IResult<&str, EntityDef> {
+    alt((
+        map(entity_value, |entity_value| {
+            EntityDef::EntityValue(entity_value)
+        }),
+        map(
+            pair(external_id, opt(ndata_decl)),
+            |(external_id, ndata_decl)| EntityDef::ExternalID {
+                external_id,
+                ndata_decl,
+            },
+        ),
+    ))(input)
+}
+
+enum PEDef<'s> {
+    EntityValue(EntityValue<'s>),
+    ExternalID(ExternalID<'s>),
+}
+/// [74] PEDef ::= EntityValue | ExternalID
+pub fn pe_def(input: &str) -> IResult<&str, PEDef> {
+    alt((
+        map(entity_value, |entity_value| {
+            PEDef::EntityValue(entity_value)
+        }),
+        map(external_id, |external_id| PEDef::ExternalID(external_id)),
+    ))(input)
+}
 
 enum ExternalID<'s> {
     SYSTEM {
@@ -567,9 +729,15 @@ pub fn text_decl(input: &str) -> IResult<&str, TextDecl> {
     )(input)
 }
 
-type extParsedEnt<'s> = (Option<TextDecl<'s>>, Content<'s>);
+struct ExtParsedEnt<'s> {
+    text_decl: Option<TextDecl<'s>>,
+    content: Content<'s>,
+}
 /// [78] extParsedEnt ::= TextDecl? content
-pub fn ext_parsed_ent(input: &str) -> IResult<&str, extParsedEnt> {
-    pair(opt(text_decl), content)(input)
+pub fn ext_parsed_ent(input: &str) -> IResult<&str, ExtParsedEnt> {
+    map(
+        pair(opt(text_decl), content),
+        |(text_decl, content)| ExtParsedEnt { text_decl, content },
+    )(input)
 }
 