WIP: extSubset

This commit is contained in:
cel 🌸 2024-06-25 00:18:18 +01:00
parent afda87a8d7
commit 0b11cbbfd8
1 changed files with 188 additions and 20 deletions

View File

@ -2,7 +2,7 @@ use std::char;
use nom::{
branch::{alt, permutation},
bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until},
bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until, take_while},
character::{
complete::one_of,
streaming::{alpha1, char, digit1, none_of, satisfy},
@ -16,6 +16,8 @@ use nom::{
// parser: parses tokens from lexer into events
// no well-formedness, validity, or data-model checks; a simple translation of the input into rust types
// output is a rust representation of the input xml
// types could be used for xml production too?
enum ContentItem<'s> {
CharData(&'s str),
@ -89,37 +91,73 @@ pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input)
}
type EntityValue<'s> = &'s str;
enum LiteralData<'s> {
String(&'s str),
PEReference(PEReference<'s>),
Reference(Reference<'s>),
}
type EntityValue<'s> = Vec<LiteralData<'s>>;
/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
/// | "'" ([^%&'] | PEReference | Reference)* "'"
pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
alt((
delimited(
char('"'),
recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&\"")))),
|string| LiteralData::String(string),
),
map(pe_reference, |pe_reference| {
LiteralData::PEReference(pe_reference)
}),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('"'),
),
delimited(
char('\''),
recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&'")))),
|string| LiteralData::String(string),
),
map(pe_reference, |pe_reference| {
LiteralData::PEReference(pe_reference)
}),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('\''),
),
))(input)
}
type AttValue<'s> = &'s str;
type AttValue<'s> = Vec<LiteralData<'s>>;
/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
/// | "'" ([^<&'] | Reference)* "'"
pub fn att_value(input: &str) -> IResult<&str, AttValue> {
alt((
delimited(
char('"'),
recognize(many0(alt((none_of("<&\""), reference)))),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&\"")))),
|string| LiteralData::String(string),
),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('"'),
),
delimited(
char('\''),
recognize(many0(alt((none_of("<&'"), reference)))),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&'")))),
|string| LiteralData::String(string),
),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('\''),
),
))(input)
@ -389,18 +427,18 @@ pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> {
))(input)
}
enum IntSubsetItem<'s> {
enum IntSubsetDeclaration<'s> {
MarkupDecl(MarkupDecl<'s>),
DeclSep(DeclSep<'s>),
}
type IntSubset<'s> = Vec<IntSubsetItem<'s>>;
type IntSubset<'s> = Vec<IntSubsetDeclaration<'s>>;
/// [28b] intSubset ::= (markupdecl | DeclSep)*
pub fn int_subset(input: &str) -> IResult<&str, IntSubset> {
many0(alt((
map(markup_decl, |markup_decl| {
IntSubsetItem::MarkupDecl(markup_decl)
IntSubsetDeclaration::MarkupDecl(markup_decl)
}),
map(decl_sep, |decl_sep| IntSubsetItem::DeclSep(decl_sep)),
map(decl_sep, |decl_sep| IntSubsetDeclaration::DeclSep(decl_sep)),
)))(input)
}
@ -432,9 +470,39 @@ pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> {
))(input)
}
struct ExtSubset<'s> {
text_decl: Option<TextDecl<'s>>,
ext_subset_decl: ExtSubsetDecl<'s>,
}
/// [30] extSubset ::= TextDecl? extSubsetDecl
pub fn ext_subset(input: &str) -> IResult<&str, ExtSubset> {
map(
pair(opt(text_decl), ext_subset_decl),
|(text_decl, ext_subset_decl)| ExtSubset {
text_decl,
ext_subset_decl,
},
)(input)
}
/// A single item of the external subset.
enum ExtSubsetDeclaration<'s> {
    /// A markup declaration, as parsed by `markup_decl`.
    MarkupDecl(MarkupDecl<'s>),
    /// A conditional section, as parsed by `conditional_sect`.
    ConditionalSect(ConditionalSect<'s>),
    /// A declaration separator, as parsed by `decl_sep`.
    DeclSep(DeclSep<'s>),
}

/// The items of the external subset, in input order.
type ExtSubsetDecl<'s> = Vec<ExtSubsetDeclaration<'s>>;

/// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
///
/// Collects zero or more items. The alternatives are tried in the order
/// markup declaration, conditional section, declaration separator.
pub fn ext_subset_decl(input: &str) -> IResult<&str, ExtSubsetDecl> {
    let declaration = alt((
        map(markup_decl, ExtSubsetDeclaration::MarkupDecl),
        map(conditional_sect, ExtSubsetDeclaration::ConditionalSect),
        map(decl_sep, ExtSubsetDeclaration::DeclSep),
    ));
    many0(declaration)(input)
}
type SDDecl = bool;
/// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
@ -458,10 +526,9 @@ pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> {
// (Productions 33 through 38 have been removed.)
struct Element<'s> {
name: &'s str,
attributes: Vec<Attribute<'s>>,
content: Content<'s>,
enum Element<'s> {
Empty(EmptyElemTag<'s>),
NotEmpty(STag<'s>, Content<'s>, ETag<'s>),
}
/// [39] element ::= EmptyElemTag | STag content ETag
pub fn element(input: &str) -> IResult<&str, Element> {
@ -480,10 +547,29 @@ pub fn attribute(input: &str) -> IResult<&str, Attribute> {
separated_pair(name, eq, att_value)(input)
}
type CharRef<'s> = &'s str;
enum CharRef<'s> {
Decimal(&'s str),
Hexadecimal(&'s str),
}
/// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
pub fn char_ref(input: &str) -> IResult<&str, CharRef> {
todo!()
alt((
delimited(
tag("&#"),
map(take_while(|c| matches!(c, '0'..='9')), |decimal| {
CharRef::Decimal(decimal)
}),
tag(";"),
),
delimited(
tag("&#x"),
map(
take_while(|c| matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F' )),
|hexadecimal| CharRef::Hexadecimal(hexadecimal),
),
tag(";"),
),
))(input)
}
enum Reference<'s> {
@ -510,7 +596,86 @@ pub fn pe_reference(input: &str) -> IResult<&str, PEReference> {
delimited(tag("%"), name, tag(";"))(input)
}
/// An entity declaration: either a general entity or a parameter entity.
enum EntityDecl<'s> {
    /// A general entity declaration, as parsed by `ge_decl`.
    GEDecl(GEDecl<'s>),
    /// A parameter entity declaration, as parsed by `pe_decl`.
    PEDecl(PEDecl<'s>),
}

/// [70] EntityDecl ::= GEDecl | PEDecl
///
/// Tries the general-entity form first and the parameter-entity form second.
pub fn entity_decl(input: &str) -> IResult<&str, EntityDecl> {
    let general = map(ge_decl, EntityDecl::GEDecl);
    let parameter = map(pe_decl, EntityDecl::PEDecl);
    alt((general, parameter))(input)
}
struct GEDecl<'s> {
name: Name<'s>,
entity_def: EntityDef<'s>,
}
/// [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
pub fn ge_decl(input: &str) -> IResult<&str, GEDecl> {
map(
delimited(
pair(tag("<!ENTITY"), s),
separated_pair(name, s, entity_def),
pair(opt(s), tag(">")),
),
|(name, entity_def)| GEDecl { name, entity_def },
)(input)
}
struct PEDecl<'s> {
name: Name<'s>,
pe_def: PEDef<'s>,
}
/// [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
pub fn pe_decl(input: &str) -> IResult<&str, PEDecl> {
map(
delimited(
tuple((tag("<!ENTITY"), s, tag("%"), s)),
separated_pair(name, s, pe_def),
pair(opt(s), tag(">")),
),
|(name, pe_def)| PEDecl { name, pe_def },
)(input)
}
/// The definition part of a general entity declaration.
enum EntityDef<'s> {
    /// A quoted entity value, as parsed by `entity_value`.
    EntityValue(EntityValue<'s>),
    /// An external identifier, optionally followed by an NDATA declaration.
    ExternalID {
        /// The external identifier, as parsed by `external_id`.
        external_id: ExternalID<'s>,
        /// The trailing NDATA declaration, if one was present.
        ndata_decl: Option<NDataDecl<'s>>,
    },
}

/// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
///
/// Tries the quoted-value form first, then the external-identifier form.
pub fn entity_def(input: &str) -> IResult<&str, EntityDef> {
    let internal = map(entity_value, EntityDef::EntityValue);
    let external = map(pair(external_id, opt(ndata_decl)), |(id, ndata)| {
        EntityDef::ExternalID {
            external_id: id,
            ndata_decl: ndata,
        }
    });
    alt((internal, external))(input)
}
/// The definition part of a parameter entity declaration.
enum PEDef<'s> {
    /// A quoted entity value, as parsed by `entity_value`.
    EntityValue(EntityValue<'s>),
    /// An external identifier, as parsed by `external_id`.
    ExternalID(ExternalID<'s>),
}

/// [74] PEDef ::= EntityValue | ExternalID
///
/// Tries the quoted-value form first, then the external-identifier form.
pub fn pe_def(input: &str) -> IResult<&str, PEDef> {
    let internal = map(entity_value, PEDef::EntityValue);
    let external = map(external_id, PEDef::ExternalID);
    alt((internal, external))(input)
}
enum ExternalID<'s> {
SYSTEM {
@ -567,9 +732,12 @@ pub fn text_decl(input: &str) -> IResult<&str, TextDecl> {
)(input)
}
type extParsedEnt<'s> = (Option<TextDecl<'s>>, Content<'s>);
struct ExtParsedEnt<'s> {
text_decl: Option<TextDecl<'s>>,
content: Content<'s>,
}
/// [78] extParsedEnt ::= TextDecl? content
pub fn ext_parsed_ent(input: &str) -> IResult<&str, extParsedEnt> {
pub fn ext_parsed_ent(input: &str) -> IResult<&str, ExtParsedEnt> {
pair(opt(text_decl), content)(input)
}