// peanuts/src/parser.rs
use std::char;

use nom::{
    branch::{alt, permutation},
    bytes::streaming::{is_a, is_not, tag, take, take_till, take_until},
    character::{
        complete::one_of,
        streaming::{alpha1, char, digit1, none_of, satisfy},
    },
    combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify},
    error::{Error, ErrorKind},
    multi::{many0, many1, many_till},
    sequence::{delimited, pair, preceded, tuple},
    Err, IResult, Needed, Parser,
};
// Parser: parses tokens from the lexer into events.
/// Comment text, held as a string slice with lifetime `'s`.
type Comment<'s> = &'s str;
/// Processing instruction: a target plus optional instruction text.
struct PI<'s> {
/// Target of the processing instruction.
target: &'s str,
/// Instruction text; `None` when there is none.
instruction: Option<&'s str>,
}
enum ContentItem<'s> {
CharData(&'s str),
Element(Element<'s>),
// Reference(Reference<'s>),
// CDSect(CDSect<'s>),
2024-06-12 10:15:48 +01:00
}
// NOTE(review): presumably `None` distinguishes "no content" from an empty list — confirm once `element` is implemented.
/// Content of an element: `None`, or a list of `ContentItem`s.
type Content<'s> = Option<Vec<ContentItem<'s>>>;
/// A key/value attribute pair; both parts are string slices.
struct Attribute<'s> {
/// Attribute name.
key: &'s str,
/// Attribute value.
value: &'s str,
}
/// Encoding name, held as a string slice.
///
/// Contains only latin characters or dash after first char.
// NOTE(review): the XML EncName production also appears to allow digits, '.' and '_'
// after the first character — confirm against the spec when the parser is written.
type EncName<'s> = &'s str;
/// Document type declaration; only the declared name is recorded so far.
struct DoctypeDecl<'s> {
    /// Name given in the declaration.
    name: &'s str,
    // TODO: doctype declaration parsing
}
///
2024-06-12 10:15:48 +01:00
pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
todo!()
}
/// An element: its name, its attributes, and its content.
struct Element<'s> {
/// Element name.
name: &'s str,
/// Attributes attached to the element.
attributes: Vec<Attribute<'s>>,
/// Content of the element (see `Content`).
content: Content<'s>,
}
/// Element
2024-06-12 10:15:48 +01:00
pub fn element(input: &str) -> IResult<&str, Element> {
todo!()
}
/// Miscellaneous markup: either a comment or a processing instruction.
enum Misc<'s> {
/// A comment.
Comment(Comment<'s>),
/// A processing instruction.
PI(PI<'s>),
}
/// Misc
2024-06-12 10:15:48 +01:00
pub fn misc(input: &str) -> IResult<&str, Misc> {
todo!()
}
type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
/// [1] document ::= prolog element Misc*
pub fn document(input: &str) -> IResult<&str, Document> {
tuple((prolog, element, many0(misc)))(input)
}
type Char = char;
/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
///
/// Consumes a single character that satisfies the production.
pub fn xmlchar(input: &str) -> IResult<&str, Char> {
    let is_xml_char = |c: char| {
        matches!(
            c,
            '\t' | '\n'
                | '\r'
                | '\u{20}'..='\u{D7FF}'
                | '\u{E000}'..='\u{FFFD}'
                | '\u{10000}'..='\u{10FFFF}'
        )
    };
    satisfy(is_xml_char)(input)
}
type S<'s> = &'s str;
/// [3] S ::= (#x20 | #x9 | #xD | #xA)+
///
/// Matches one or more whitespace characters (space, tab, carriage return, line feed).
pub fn s(input: &str) -> IResult<&str, S> {
    // Space, tab, carriage return, line feed.
    const WHITESPACE: &str = " \t\r\n";
    is_a(WHITESPACE)(input)
}
type NameStartChar = char;
/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
///
/// Consumes a single character that may begin a name.
pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
    let starts_name = |c: char| {
        matches!(
            c,
            ':' | '_'
                | 'A'..='Z'
                | 'a'..='z'
                | '\u{C0}'..='\u{D6}'
                | '\u{D8}'..='\u{F6}'
                | '\u{F8}'..='\u{2FF}'
                | '\u{370}'..='\u{37D}'
                | '\u{37F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}'
        )
    };
    satisfy(starts_name)(input)
}
type NameChar = char;
/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
///
/// Tries `name_start_char` first, then the extra characters that are only
/// allowed after the first position of a name.
pub fn name_char(input: &str) -> IResult<&str, NameChar> {
    let extra_name_char = satisfy(|c: char| {
        matches!(
            c,
            '-' | '.'
                | '0'..='9'
                | '\u{B7}'
                | '\u{0300}'..='\u{036F}'
                | '\u{203F}'..='\u{2040}'
        )
    });
    alt((name_start_char, extra_name_char))(input)
}
type Name<'s> = &'s str;
/// [5] Name ::= NameStartChar (NameChar)*
pub fn name(input: &str) -> IResult<&str, Name> {
recognize(pair(name_start_char, many0(name_char)))(input)
}
type Names<'s> = &'s str;
/// [6] Names ::= Name (#x20 Name)*
pub fn names(input: &str) -> IResult<&str, Names> {
recognize(pair(name, many0(pair(char('\u{20}'), name))))(input)
}
type Nmtoken<'s> = &'s str;
/// [7] Nmtoken ::= (NameChar)+
///
/// Returns the matched token as a slice of `input`.
pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
    let token_chars = many1(name_char);
    recognize(token_chars)(input)
}
type Nmtokens<'s> = &'s str;
/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input)
}
type EntityValue<'s> = &'s str;
/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
///                   | "'" ([^%&'] | PEReference | Reference)* "'"
///
/// Returns the text between the quotes; the quotes themselves are consumed
/// but not included in the result.
pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
    let double_quoted = delimited(
        char('"'),
        recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
        char('"'),
    );
    let single_quoted = delimited(
        char('\''),
        recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
        char('\''),
    );
    alt((double_quoted, single_quoted))(input)
}
type AttValue<'s> = &'s str;
/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
///                 | "'" ([^<&'] | Reference)* "'"
///
/// Returns the text between the quotes; the quotes themselves are consumed
/// but not included in the result.
pub fn att_value(input: &str) -> IResult<&str, AttValue> {
    let double_quoted = delimited(
        char('"'),
        recognize(many0(alt((none_of("<&\""), reference)))),
        char('"'),
    );
    let single_quoted = delimited(
        char('\''),
        recognize(many0(alt((none_of("<&'"), reference)))),
        char('\''),
    );
    alt((double_quoted, single_quoted))(input)
}
type SystemLiteral<'s> = &'s str;
/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
///
/// Returns the text between the quotes; the quotes themselves are consumed
/// but not included in the result.
pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
    let double_quoted = delimited(char('"'), recognize(many0(none_of("\""))), char('"'));
    let single_quoted = delimited(char('\''), recognize(many0(none_of("'"))), char('\''));
    alt((double_quoted, single_quoted))(input)
}
type PubidLiteral<'s> = &'s str;
/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
alt((
delimited(char('"'), recognize(many0(pubid_char)), char('"')),
delimited(
char('\''),
recognize(many0(recognize(not(char('\''))).and_then(pubid_char))),
char('\''),
),
))(input)
}
type PubidChar<'s> = char;
/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
///
/// Consumes a single character allowed inside a public identifier literal.
pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
    satisfy(|c| {
        matches!(
            c,
            // Whitespace and alphanumerics.
            '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'
                // Punctuation set from the production: -'()+,./:=?;!*#@$_%
                | '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' | '='
                | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%'
        )
    })(input)
}
2024-06-14 16:53:47 +01:00
// TODO: wtf why doesn't this work how do i do thisjj
2024-06-12 10:15:48 +01:00
type CharData<'s> = &'s str;
/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
pub fn char_data(input: &str) -> IResult<&str, CharData> {
2024-06-14 16:53:47 +01:00
// tag(map(
// peek(alt((
// map_parser(
// peek(take_until("]]>")),
// nom::bytes::complete::take_till(|c| c == '<' || c == '&'),
// ),
// map_parser(
// peek(take_till(|c| c == '<' || c == '&')),
// nom::bytes::complete::take_until("]]>"),
// ),
// ))),
// |(first, _)| first,
// ))(input)
// map(
// tuple((is_not("<&]"), peek(alt((tag("<"), tag("&"), tag("]]>")))))),
// |(first, _)| first,
// )(input)
// map(
// tuple((recognize(many0(none_of("<&"))), opt(peek(tag("]]>"))))),
// |(first, _)| first,
// )(input)
// alt((recognize(many0(none_of("<&"))), take_until("]]>")))(input)
let tagg: &str;
if let Ok((_, tagg1)) = peek(take_until::<&str, &str, Error<&str>>("]]>"))(input) {
if let Ok((_, tagg2)) =
peek::<&str, &str, Error<&str>, _>(take_till(|c: char| c == '<' || c == '&'))(input)
{
if tagg1.len() < tagg2.len() {
tagg = tagg1
} else {
tagg = tagg2
}
} else {
tagg = tagg1;
}
} else {
(_, tagg) = peek(take_till(|c| c == '<' || c == '&'))(input)?
}
tag(tagg)(input)
// let mut len = 0;
// let ch = input.chars().collect::<Vec<_>>();
// for (idx, char) in ch.as_ref().into_iter().enumerate() {
// match char {
// '<' | '&' => break,
// ']' => {
// if idx <= ch.len() - 3 {}
// },
// _ => todo!(),
// }
// }
// while let Some(char) = chars.next() {
// if char == '<' || char == '&' {
// break;
// } else if char == ']' {
// if let Some(next) = chars.peek() {
// if next == ']' {
// if let Some(next) = chars.next_if_eq() {}
// }
// }
// }
// len += 1;
// }
// todo!()
// recognize(many0(permutation((none_of("<&"), not(tag("]]>"))))))(input)
// recognize(many0(not(alt((tag("<"), tag("&"), tag("]]>"))))))(input)
// take_till(|c| c == '<' || c == '&').and_then(take_until("]]>"))(input)
2024-06-12 10:15:48 +01:00
}
type Prolog<'s> = (
Option<XMLDecl>,
Vec<Misc<'s>>,
Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
);
/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
pub fn prolog(input: &str) -> IResult<&str, Prolog> {
tuple((
opt(xml_decl),
many0(misc),
opt(tuple((doctypedecl, many0(misc)))),
))(input)
}
struct XMLDecl {
version_info: VersionInfo,
// encoding_decl: Option<EncodingDecl>,
// sd_decl: Option<SDDecl>,
2024-06-12 10:15:48 +01:00
}
/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
// (VersionInfo, Option<EncodingDecl>, Option<SDDecl>)
let (leftover, (version_info /* encoding_decl, sd_decl */,)) = delimited(
2024-06-12 10:15:48 +01:00
tag("<?xml"),
tuple((version_info /* opt(encoding_decl), opt(sd_decl) */,)),
2024-06-12 10:15:48 +01:00
tag("?>"),
)(input)?;
Ok((
leftover,
XMLDecl {
version_info,
// encoding_decl,
// sd_decl,
2024-06-12 10:15:48 +01:00
},
))
}
type VersionInfo = VersionNum;
/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
///
/// Skips the leading whitespace, the `version` keyword and the equals sign,
/// then returns the quoted version number.
pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
    let (rest, _) = tuple((s, tag("version"), eq))(input)?;
    let single_quoted = delimited(char('\''), version_num, char('\''));
    let double_quoted = delimited(char('"'), version_num, char('"'));
    alt((single_quoted, double_quoted))(rest)
}
/// [25] Eq ::= S? '=' S?
///
/// Returns the optional whitespace before the sign, the `=` character, and
/// the optional whitespace after it.
pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> {
    let (rest, before) = opt(s)(input)?;
    let (rest, sign) = char('=')(rest)?;
    let (rest, after) = opt(s)(rest)?;
    Ok((rest, (before, sign, after)))
}
/// XML version number as produced by `version_num`.
#[derive(Clone)]
enum VersionNum {
/// Version `1.0`.
One,
/// Version `1.1`.
OneDotOne,
}
/// [26] VersionNum ::= '1.' [0-9]+
///
/// Only the minor digits `0` and `1` are accepted, mapping to
/// `VersionNum::One` and `VersionNum::OneDotOne` respectively.
pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
    let (rest, _) = tag("1.")(input)?;
    let one_dot_zero = value(VersionNum::One, char('0'));
    let one_dot_one = value(VersionNum::OneDotOne, char('1'));
    alt((one_dot_zero, one_dot_one))(rest)
}
/// Placeholder parser for a reference, used by `entity_value` and `att_value`.
///
/// Not implemented yet: calling this panics via `todo!()`.
pub fn reference(input: &str) -> IResult<&str, char> {
todo!()
}
/// Placeholder parser for a parameter-entity reference, used by `entity_value`.
///
/// Not implemented yet: calling this panics via `todo!()`.
pub fn pe_reference(input: &str) -> IResult<&str, char> {
todo!()
}
#[cfg(test)]
mod tests {
    use std::num::NonZero;

    use super::*;

    /// `char_data` stops before the first `<`, `&` or `]]>` and reports
    /// `Incomplete` when no terminator is present.
    #[test]
    fn test_char_data() {
        // `&` comes before `]]>`.
        assert_eq!(Ok(("&def]]>ghi", "abc")), char_data("abc&def]]>ghi"));
        // Only `]]>` is present.
        assert_eq!(Ok(("]]>ghi", "abcdef")), char_data("abcdef]]>ghi"));
        // Only `&` is present.
        assert_eq!(Ok(("&defghi", "abc")), char_data("abc&defghi"));
        // `]]>` comes before `&`.
        assert_eq!(Ok(("]]>def&ghi", "abc")), char_data("abc]]>def&ghi"));
        // A lone `]>` is ordinary character data.
        assert_eq!(Ok(("&ghi", "abc]>def")), char_data("abc]>def&ghi"));
        // No terminator at all: more input is needed.
        assert_eq!(
            Err(Err::Incomplete(nom::Needed::Size(
                NonZero::new(1usize).unwrap()
            ))),
            char_data("abcdefghi")
        );
    }
}