From afda87a8d7f347b0c4d34aa798f041d05b41bff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?cel=20=F0=9F=8C=B8?= Date: Mon, 24 Jun 2024 18:02:21 +0100 Subject: [PATCH] WIP: dtd garbo --- src/parser.rs | 282 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 244 insertions(+), 38 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index d049c5c..e689a53 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -10,11 +10,12 @@ use nom::{ combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify}, error::{Error, ErrorKind}, multi::{many0, many1, many_till}, - sequence::{delimited, pair, preceded, tuple}, + sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}, Err, IResult, Parser, }; // parser: parses tokens from lexer into events +// no well formedness, validity, or data model, simple translation of input into rust types enum ContentItem<'s> { CharData(&'s str), @@ -25,15 +26,6 @@ enum ContentItem<'s> { type Content<'s> = Option>>; -struct DoctypeDecl<'s> { - name: &'s str, - // TODO: doctype declaration parsing -} -/// -pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { - todo!() -} - type Document<'s> = (Prolog<'s>, Element<'s>, Vec>); /// [1] document ::= prolog element Misc* pub fn document(input: &str) -> IResult<&str, Document> { @@ -211,21 +203,20 @@ struct PI<'s> { } /// [16] PI ::= '' Char*)))? '?>' pub fn pi(input: &str) -> IResult<&str, PI> { - let (rest, (target, instruction)) = delimited( - tag("")))))), + map( + delimited( + tag("")))))), + ), + tag("?>"), ), - tag("?>"), - )(input)?; - Ok(( - rest, - PI { + |(target, instruction)| PI { target, instruction, }, - )) + )(input) } type PITarget<'s> = &'s str; @@ -288,21 +279,18 @@ struct XMLDecl<'s> { } /// [23] XMLDecl ::= '' pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { - // (VersionInfo, Option, Option) - let (leftover, (version_info, encoding_decl, sd_decl)) = delimited( - tag("")), - )(input)?; - // TODO: change to map - Ok(( - leftover, - XMLDecl { + map( + delimited( + tag("")), + ), + |(version_info, encoding_decl, sd_decl)| XMLDecl { version_info, encoding_decl, sd_decl, }, - )) + )(input) } type VersionInfo = VersionNum; @@ -342,6 +330,7 @@ pub fn version_num(input: &str) -> IResult<&str, VersionNum> { enum Misc<'s> { Comment(Comment<'s>), PI(PI<'s>), + // TODO: how to deal with whitespace S, } /// [27] Misc ::= Comment | PI | S @@ -353,6 +342,100 @@ pub fn misc(input: &str) -> IResult<&str, Misc> { ))(input) } +struct DoctypeDecl<'s> { + name: &'s str, + external_id: Option>, + int_subset: Option>, +} +/// [28] doctypedecl ::= '' +pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { + map( + delimited( + pair(tag(""), + ), + |(name, external_id, int_subset)| DoctypeDecl { + name, + external_id, + int_subset, + }, + )(input) +} + +#[derive(Clone)] +enum DeclSep<'s> { + PEReference(PEReference<'s>), + // TODO: tackle whitespace + S, +} +/// [28a] DeclSep ::= PEReference | S +pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> { + alt(( + map(pe_reference, |pe_reference| { + DeclSep::PEReference(pe_reference) + }), + value(DeclSep::S, s), + ))(input) +} + +enum IntSubsetItem<'s> { + MarkupDecl(MarkupDecl<'s>), + DeclSep(DeclSep<'s>), +} +type IntSubset<'s> = Vec>; +/// [28b] intSubset ::= (markupdecl | DeclSep)* +pub fn int_subset(input: &str) -> IResult<&str, IntSubset> { + many0(alt(( + map(markup_decl, |markup_decl| { + IntSubsetItem::MarkupDecl(markup_decl) + }), + map(decl_sep, |decl_sep| IntSubsetItem::DeclSep(decl_sep)), + )))(input) +} + +enum MarkupDecl<'s> { + ElementDecl(ElementDecl<'s>), + AttlistDecl(AttlistDecl<'s>), + EntityDecl(EntityDecl<'s>), + NotationDecl(NotationDecl<'s>), + PI(PI<'s>), + Comment(Comment<'s>), +} +/// [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment +pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> { + alt(( + map(element_decl, |element_decl| { + MarkupDecl::ElementDecl(element_decl) + }), + map(attlist_decl, |attlist_decl| { + MarkupDecl::AttlistDecl(attlist_decl) + }), + map(entity_decl, |entity_decl| { + MarkupDecl::EntityDecl(entity_decl) + }), + map(notation_decl, |notation_decl| { + MarkupDecl::NotationDecl(notation_decl) + }), + map(pi, |pi| MarkupDecl::PI(pi)), + map(comment, |comment| MarkupDecl::Comment(comment)), + ))(input) +} + +/// [30] extSubset ::= TextDecl? extSubsetDecl + +/// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* + type SDDecl = bool; /// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> { @@ -388,18 +471,106 @@ pub fn element(input: &str) -> IResult<&str, Element> { ))(input) } -let +// let STag<'s> = (Name<'s>, ); /// [40] STag ::= '<' Name (S Attribute)* S? '>' -type Attribute<'s> = (&'s str, &'s str) -/// [41] Attribute ::= Name Eq AttValue +type Attribute<'s> = (Name<'s>, AttValue<'s>); +/// [41] Attribute ::= Name Eq AttValue +pub fn attribute(input: &str) -> IResult<&str, Attribute> { + separated_pair(name, eq, att_value)(input) +} -pub fn reference(input: &str) -> IResult<&str, char> { +type CharRef<'s> = &'s str; +/// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' +pub fn char_ref(input: &str) -> IResult<&str, CharRef> { todo!() } -pub fn pe_reference(input: &str) -> IResult<&str, char> { - todo!() +enum Reference<'s> { + EntityRef(EntityRef<'s>), + CharRef(CharRef<'s>), +} +/// [67] Reference ::= EntityRef | CharRef +pub fn reference(input: &str) -> IResult<&str, Reference> { + alt(( + map(entity_ref, |entity_ref| Reference::EntityRef(entity_ref)), + map(char_ref, |char_ref| Reference::CharRef(char_ref)), + ))(input) +} + +type EntityRef<'s> = &'s str; +/// [68] EntityRef ::= '&' Name ';' +pub fn entity_ref(input: &str) -> IResult<&str, EntityRef> { + delimited(tag("&"), name, tag(";"))(input) +} + +type PEReference<'s> = &'s str; +/// [69] PEReference ::= '%' Name ';' +pub fn pe_reference(input: &str) -> IResult<&str, PEReference> { + delimited(tag("%"), name, tag(";"))(input) +} + +/// TODO: entity declarations + +enum ExternalID<'s> { + SYSTEM { + system_identifier: &'s str, + }, + PUBLIC { + public_identifier: &'s str, + system_identifier: &'s str, + }, +} +/// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral +// pub fn external_id(input: &str) -> IResult<&str, ExternalID> { +pub fn external_id(input: &str) -> IResult<&str, ExternalID> { + alt(( + map( + preceded(pair(tag("SYSTEM"), s), system_literal), + |system_identifier| ExternalID::SYSTEM { system_identifier }, + ), + map( + preceded( + pair(tag("PUBLIC"), s), + separated_pair(pubid_literal, s, system_literal), + ), + |(public_identifier, system_identifier)| ExternalID::PUBLIC { + public_identifier, + system_identifier, + }, + ), + ))(input) +} + +type NDataDecl<'s> = &'s str; +/// [76] NDataDecl ::= S 'NDATA' S Name +pub fn ndata_decl(input: &str) -> IResult<&str, NDataDecl> { + preceded(tuple((s, tag("NDATA"), s)), name)(input) +} + +struct TextDecl<'s> { + version_info: Option, + encoding_decl: EncodingDecl<'s>, +} +/// [77] TextDecl ::= '' +pub fn text_decl(input: &str) -> IResult<&str, TextDecl> { + map( + delimited( + tag(""), + ), + |(version_info, encoding_decl)| TextDecl { + version_info, + encoding_decl, + }, + )(input) +} + +type extParsedEnt<'s> = (Option>, Content<'s>); +/// [78] extParsedEnt ::= TextDecl? content +pub fn ext_parsed_ent(input: &str) -> IResult<&str, extParsedEnt> { + pair(opt(text_decl), content)(input) } type EncodingDecl<'s> = EncName<'s>; @@ -425,6 +596,41 @@ pub fn enc_name(input: &str) -> IResult<&str, EncName> { ))(input) } +struct NotationDecl<'s> { + name: &'s str, + id: NotationDeclID<'s>, +} +enum NotationDeclID<'s> { + External(ExternalID<'s>), + Public(PublicID<'s>), +} +/// [82] NotationDecl ::= '' +pub fn notation_decl(input: &str) -> IResult<&str, NotationDecl> { + map( + delimited( + pair(tag("")), + ), + |(name, id)| NotationDecl { name, id }, + )(input) +} + +type PublicID<'s> = &'s str; +/// [83] PublicID ::= 'PUBLIC' S PubidLiteral +pub fn public_id(input: &str) -> IResult<&str, PublicID> { + preceded(pair(tag("PUBLIC"), s), pubid_literal)(input) +} + #[cfg(test)] mod tests { use std::num::NonZero;