462 lines
14 KiB
Rust
462 lines
14 KiB
Rust
use std::char;
|
|
|
|
use nom::{
|
|
branch::{alt, permutation},
|
|
bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until},
|
|
character::{
|
|
complete::one_of,
|
|
streaming::{alpha1, char, digit1, none_of, satisfy},
|
|
},
|
|
combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify},
|
|
error::{Error, ErrorKind},
|
|
multi::{many0, many1, many_till},
|
|
sequence::{delimited, pair, preceded, tuple},
|
|
Err, IResult, Parser,
|
|
};
|
|
|
|
// parser: parses tokens from lexer into events
|
|
|
|
enum ContentItem<'s> {
|
|
CharData(&'s str),
|
|
Element(Element<'s>),
|
|
// Reference(Reference<'s>),
|
|
// CDSect(CDSect<'s>),
|
|
}
|
|
|
|
type Content<'s> = Option<Vec<ContentItem<'s>>>;
|
|
|
|
/// A single `key="value"` attribute, with both parts borrowed from the input.
///
/// The derives make attributes printable, clonable and comparable, so they
/// can be used directly in `assert_eq!` and debug output.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Attribute<'s> {
    /// Attribute name.
    key: &'s str,
    /// Attribute value as a borrowed slice.
    value: &'s str,
}
|
|
|
|
/// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
///
/// An encoding name: starts with an ASCII letter; the remaining characters
/// may be ASCII letters, digits, `.`, `_` or `-`.
type EncName<'s> = &'s str;
|
|
|
|
struct DoctypeDecl<'s> {
|
|
name: &'s str,
|
|
// TODO: doctype declaration parsing
|
|
}
|
|
///
|
|
pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
|
|
todo!()
|
|
}
|
|
|
|
struct Element<'s> {
|
|
name: &'s str,
|
|
attributes: Vec<Attribute<'s>>,
|
|
content: Content<'s>,
|
|
}
|
|
/// Element
|
|
pub fn element(input: &str) -> IResult<&str, Element> {
|
|
todo!()
|
|
}
|
|
|
|
enum Misc<'s> {
|
|
Comment(Comment<'s>),
|
|
PI(PI<'s>),
|
|
}
|
|
/// Misc
|
|
pub fn misc(input: &str) -> IResult<&str, Misc> {
|
|
todo!()
|
|
}
|
|
|
|
type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
|
|
/// [1] document ::= prolog element Misc*
|
|
pub fn document(input: &str) -> IResult<&str, Document> {
|
|
tuple((prolog, element, many0(misc)))(input)
|
|
}
|
|
|
|
type Char = char;
|
|
/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
|
|
pub fn xmlchar(input: &str) -> IResult<&str, Char> {
|
|
satisfy(
|
|
|c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'),
|
|
)(input)
|
|
}
|
|
|
|
type S<'s> = &'s str;
|
|
/// [3] S ::= (#x20 | #x9 | #xD | #xA)+
|
|
pub fn s(input: &str) -> IResult<&str, S> {
|
|
is_a("\u{20}\u{9}\u{D}\u{A}")(input)
|
|
}
|
|
|
|
type NameStartChar = char;
|
|
/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
|
|
pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
|
|
satisfy(
|
|
|c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'),
|
|
)(input)
|
|
}
|
|
|
|
type NameChar = char;
|
|
/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
|
|
pub fn name_char(input: &str) -> IResult<&str, NameChar> {
|
|
alt((
|
|
name_start_char,
|
|
satisfy(
|
|
|c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'),
|
|
),
|
|
))(input)
|
|
}
|
|
|
|
type Name<'s> = &'s str;
|
|
/// [5] Name ::= NameStartChar (NameChar)*
|
|
pub fn name(input: &str) -> IResult<&str, Name> {
|
|
recognize(pair(name_start_char, many0(name_char)))(input)
|
|
}
|
|
|
|
type Names<'s> = &'s str;
|
|
/// [6] Names ::= Name (#x20 Name)*
|
|
pub fn names(input: &str) -> IResult<&str, Names> {
|
|
recognize(pair(name, many0(pair(char('\u{20}'), name))))(input)
|
|
}
|
|
|
|
type Nmtoken<'s> = &'s str;
|
|
/// [7] Nmtoken ::= (NameChar)+
|
|
pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
|
|
recognize(many1(name_char))(input)
|
|
}
|
|
|
|
type Nmtokens<'s> = &'s str;
|
|
/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
|
|
pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
|
|
recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input)
|
|
}
|
|
|
|
type EntityValue<'s> = &'s str;
|
|
/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
|
|
/// | "'" ([^%&'] | PEReference | Reference)* "'"
|
|
pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
|
|
alt((
|
|
delimited(
|
|
char('"'),
|
|
recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
|
|
char('"'),
|
|
),
|
|
delimited(
|
|
char('\''),
|
|
recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
|
|
char('\''),
|
|
),
|
|
))(input)
|
|
}
|
|
|
|
type AttValue<'s> = &'s str;
|
|
/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
|
|
/// | "'" ([^<&'] | Reference)* "'"
|
|
pub fn att_value(input: &str) -> IResult<&str, AttValue> {
|
|
alt((
|
|
delimited(
|
|
char('"'),
|
|
recognize(many0(alt((none_of("<&\""), reference)))),
|
|
char('"'),
|
|
),
|
|
delimited(
|
|
char('\''),
|
|
recognize(many0(alt((none_of("<&'"), reference)))),
|
|
char('\''),
|
|
),
|
|
))(input)
|
|
}
|
|
|
|
type SystemLiteral<'s> = &'s str;
|
|
/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
|
|
pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
|
|
alt((
|
|
delimited(char('"'), recognize(many0(none_of("\""))), char('"')),
|
|
delimited(char('\''), recognize(many0(none_of("'"))), char('\'')),
|
|
))(input)
|
|
}
|
|
|
|
type PubidLiteral<'s> = &'s str;
|
|
/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
|
|
pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
|
|
alt((
|
|
delimited(char('"'), recognize(many0(pubid_char)), char('"')),
|
|
delimited(
|
|
char('\''),
|
|
recognize(many0(recognize(not(char('\''))).and_then(pubid_char))),
|
|
char('\''),
|
|
),
|
|
))(input)
|
|
}
|
|
|
|
type PubidChar<'s> = char;
|
|
/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
|
|
pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
|
|
satisfy(|c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'))(
|
|
input,
|
|
)
|
|
}
|
|
|
|
type CharData<'s> = &'s str;
|
|
/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
|
|
pub fn char_data(input: &str) -> IResult<&str, CharData> {
|
|
recognize(many_till(
|
|
none_of("<&"),
|
|
peek(alt((recognize(one_of("<&")), tag("]]>")))),
|
|
))(input)
|
|
|
|
// let tagg: &str;
|
|
// if let Ok((_, tagg1)) = peek(take_until::<&str, &str, Error<&str>>("]]>"))(input) {
|
|
// if let Ok((_, tagg2)) =
|
|
// peek::<&str, &str, Error<&str>, _>(take_till(|c: char| c == '<' || c == '&'))(input)
|
|
// {
|
|
// if tagg1.len() < tagg2.len() {
|
|
// tagg = tagg1
|
|
// } else {
|
|
// tagg = tagg2
|
|
// }
|
|
// } else {
|
|
// tagg = tagg1;
|
|
// }
|
|
// } else {
|
|
// (_, tagg) = peek(take_till(|c| c == '<' || c == '&'))(input)?
|
|
// }
|
|
// tag(tagg)(input)
|
|
|
|
// recognize(many0(permutation((none_of("<&"), not(tag("]]>"))))))(input)
|
|
// recognize(many0(not(alt((tag("<"), tag("&"), tag("]]>"))))))(input)
|
|
// take_till(|c| c == '<' || c == '&').and_then(take_until("]]>"))(input)
|
|
}
|
|
|
|
type Comment<'s> = &'s str;
|
|
/// Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
|
|
pub fn comment(input: &str) -> IResult<&str, Comment> {
|
|
delimited(
|
|
tag("<!--"),
|
|
recognize(many_till(xmlchar, peek(tag("--")))),
|
|
tag("-->"),
|
|
)(input)
|
|
}
|
|
|
|
struct PI<'s> {
|
|
target: &'s str,
|
|
instruction: Option<&'s str>,
|
|
}
|
|
/// [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
|
|
pub fn pi(input: &str) -> IResult<&str, PI> {
|
|
let (rest, (target, instruction)) = delimited(
|
|
tag("<?"),
|
|
pair(
|
|
pi_target,
|
|
opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))),
|
|
),
|
|
tag("?>"),
|
|
)(input)?;
|
|
Ok((
|
|
rest,
|
|
PI {
|
|
target,
|
|
instruction,
|
|
},
|
|
))
|
|
}
|
|
|
|
type PITarget<'s> = &'s str;
|
|
/// [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
|
|
pub fn pi_target(input: &str) -> IResult<&str, PITarget> {
|
|
let (rest, name) = name(input)?;
|
|
if name.to_lowercase() == "xml" {
|
|
return Err(Err::Error(Error {
|
|
input,
|
|
// TODO: check if better error to return
|
|
code: ErrorKind::Tag,
|
|
}));
|
|
} else {
|
|
return Ok((rest, name));
|
|
}
|
|
}
|
|
|
|
type CDSect<'s> = (CDStart<'s>, CData<'s>, CDEnd<'s>);
|
|
/// [18] CDSect ::= CDStart CData CDEnd
|
|
pub fn cd_sect(input: &str) -> IResult<&str, CDSect> {
|
|
tuple((cd_start, cdata, cd_end))(input)
|
|
}
|
|
|
|
type CDStart<'s> = &'s str;
|
|
/// [19] CDStart ::= '<![CDATA['
|
|
pub fn cd_start(input: &str) -> IResult<&str, CDStart> {
|
|
tag("<![CDATA[")(input)
|
|
}
|
|
|
|
type CData<'s> = &'s str;
|
|
/// [20] CData ::= (Char* - (Char* ']]>' Char*))
|
|
pub fn cdata(input: &str) -> IResult<&str, CData> {
|
|
recognize(many_till(xmlchar, peek(tag("]]>"))))(input)
|
|
}
|
|
|
|
type CDEnd<'s> = &'s str;
|
|
/// [21] CDEnd ::= ']]>'
|
|
pub fn cd_end(input: &str) -> IResult<&str, CDEnd> {
|
|
tag("]]>")(input)
|
|
}
|
|
|
|
type Prolog<'s> = (
|
|
Option<XMLDecl>,
|
|
Vec<Misc<'s>>,
|
|
Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
|
|
);
|
|
/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
|
|
pub fn prolog(input: &str) -> IResult<&str, Prolog> {
|
|
tuple((
|
|
opt(xml_decl),
|
|
many0(misc),
|
|
opt(tuple((doctypedecl, many0(misc)))),
|
|
))(input)
|
|
}
|
|
|
|
struct XMLDecl {
|
|
version_info: VersionInfo,
|
|
// encoding_decl: Option<EncodingDecl>,
|
|
// sd_decl: Option<SDDecl>,
|
|
}
|
|
/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
|
|
pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
|
|
// (VersionInfo, Option<EncodingDecl>, Option<SDDecl>)
|
|
let (leftover, (version_info /* encoding_decl, sd_decl */,)) = delimited(
|
|
tag("<?xml"),
|
|
tuple((version_info /* opt(encoding_decl), opt(sd_decl) */,)),
|
|
tag("?>"),
|
|
)(input)?;
|
|
Ok((
|
|
leftover,
|
|
XMLDecl {
|
|
version_info,
|
|
// encoding_decl,
|
|
// sd_decl,
|
|
},
|
|
))
|
|
}
|
|
|
|
type VersionInfo = VersionNum;
|
|
/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
|
|
pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
|
|
preceded(
|
|
tuple((s, tag("version"), eq)),
|
|
alt((
|
|
delimited(char('\''), version_num, char('\'')),
|
|
delimited(char('"'), version_num, char('"')),
|
|
)),
|
|
)(input)
|
|
}
|
|
|
|
/// [25] Eq ::= S? '=' S?
|
|
pub fn eq(input: &str) -> IResult<&str, &str> {
|
|
recognize(tuple((opt(s), char('='), opt(s))))(input)
|
|
}
|
|
|
|
#[derive(Clone)]
|
|
enum VersionNum {
|
|
One,
|
|
OneDotOne,
|
|
}
|
|
/// [26] VersionNum ::= '1.' [0-9]+
|
|
pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
|
|
preceded(
|
|
tag("1."),
|
|
alt((
|
|
value(VersionNum::One, char('0')),
|
|
value(VersionNum::OneDotOne, char('1')),
|
|
)),
|
|
)(input)
|
|
}
|
|
|
|
pub fn reference(input: &str) -> IResult<&str, char> {
|
|
todo!()
|
|
}
|
|
|
|
pub fn pe_reference(input: &str) -> IResult<&str, char> {
|
|
todo!()
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use std::num::NonZero;

    use super::*;

    /// Builds the streaming "need `n` more bytes" error.
    fn incomplete(n: usize) -> Err<Error<&'static str>> {
        Err::Incomplete(nom::Needed::Size(NonZero::new(n).unwrap()))
    }

    #[test]
    fn test_char_data() {
        // (input, expected remainder, expected character data)
        let cases = [
            ("abc&def]]>ghi", "&def]]>ghi", "abc"),
            ("abcdef]]>ghi", "]]>ghi", "abcdef"),
            ("abc&defghi", "&defghi", "abc"),
            ("abc]]>def&ghi", "]]>def&ghi", "abc"),
            ("abc]>def&ghi", "&ghi", "abc]>def"),
        ];
        for (input, rest, data) in cases {
            assert_eq!(Ok((rest, data)), char_data(input));
        }

        // No terminator in sight: the parser asks for more input.
        assert_eq!(Err(incomplete(3)), char_data("abcdefghi"));
    }

    #[test]
    fn test_comment() {
        // (input, expected comment body)
        let cases = [
            ("<!---->", ""),
            ("<!--asdf-->", "asdf"),
            ("<!--as-df-->", "as-df"),
        ];
        for (input, body) in cases {
            assert_eq!(Ok(("", body)), comment(input));
        }

        // Unterminated comment: the parser asks for more input.
        assert_eq!(Err(incomplete(2)), comment("<!--asdf"));
    }

    #[test]
    fn test_pi_target() {
        assert_eq!(Ok((" ", "asdf")), pi_target("asdf "));
        assert_eq!(Ok((" ", "xmlasdf")), pi_target("xmlasdf "));

        // The reserved name is rejected in any letter case.
        for rejected in ["xml ", "xMl "] {
            assert_eq!(
                Err(Err::Error(Error {
                    input: rejected,
                    code: ErrorKind::Tag
                })),
                pi_target(rejected)
            );
        }
    }

    #[test]
    fn test_cd_sect() {
        let expected = ("<![CDATA[", "<greeting>Hello, world!</greeting>", "]]>");
        assert_eq!(
            Ok(("", expected)),
            cd_sect("<![CDATA[<greeting>Hello, world!</greeting>]]>")
        );
    }

    #[test]
    fn test_cd_start() {
        assert_eq!(Ok(("asdf", "<![CDATA[")), cd_start("<![CDATA[asdf"));
    }

    #[test]
    fn test_cdata() {
        // (input, expected data); the remainder is always "]]>asdf".
        let cases = [
            ("asdf]]>asdf", "asdf"),
            ("<![CDATA[asdf]]>asdf", "<![CDATA[asdf"),
            (
                "<greeting>Hello, world!</greeting>]]>asdf",
                "<greeting>Hello, world!</greeting>",
            ),
        ];
        for (input, data) in cases {
            assert_eq!(Ok(("]]>asdf", data)), cdata(input));
        }
    }

    #[test]
    fn test_cd_end() {
        assert_eq!(Ok(("asdf", "]]>")), cd_end("]]>asdf"));
    }
}
|