[reader] Avoid handling of Unicode when not necessary

Clif files are not meant to be written by end-users anyway. The main
effects are that non-ASCII identifiers now fail during lexing rather than
during parsing, and that whitespace must now be in the ASCII range.
Comments still have full Unicode support.

This also inlines all char::is_* methods to avoid nested matches.

Overall this results in a slight reduction of instruction count.
This commit is contained in:
bjorn3
2020-08-26 11:30:19 +02:00
committed by Andrew Brown
parent 19393be396
commit 0d3f9ad8ef

View File

@@ -179,10 +179,8 @@ impl<'a> Lexer<'a> {
// Starting from `lookahead`, are we looking at a number? // Starting from `lookahead`, are we looking at a number?
fn looking_at_numeric(&self) -> bool { fn looking_at_numeric(&self) -> bool {
if let Some(c) = self.lookahead { if let Some(c) = self.lookahead {
if c.is_digit(10) {
return true;
}
match c { match c {
'0'..='9' => return true,
'-' => return true, '-' => return true,
'+' => return true, '+' => return true,
'.' => return true, '.' => return true,
@@ -291,7 +289,7 @@ impl<'a> Lexer<'a> {
match self.next_ch() { match self.next_ch() {
Some('-') | Some('_') => {} Some('-') | Some('_') => {}
Some('.') => is_float = true, Some('.') => is_float = true,
Some(ch) if ch.is_alphanumeric() => {} Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
_ => break, _ => break,
} }
} }
@@ -309,11 +307,10 @@ impl<'a> Lexer<'a> {
let begin = self.pos; let begin = self.pos;
let loc = self.loc(); let loc = self.loc();
assert!(self.lookahead == Some('_') || self.lookahead.unwrap().is_alphabetic()); assert!(self.lookahead == Some('_') || self.lookahead.unwrap().is_ascii_alphabetic());
loop { loop {
match self.next_ch() { match self.next_ch() {
Some('_') => {} Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
Some(ch) if ch.is_alphanumeric() => {}
_ => break, _ => break,
} }
} }
@@ -398,9 +395,10 @@ impl<'a> Lexer<'a> {
assert_eq!(self.lookahead, Some('%')); assert_eq!(self.lookahead, Some('%'));
while let Some(c) = self.next_ch() { loop {
if !(c.is_ascii() && c.is_alphanumeric() || c == '_') { match self.next_ch() {
break; Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
_ => break,
} }
} }
@@ -490,8 +488,8 @@ impl<'a> Lexer<'a> {
Some(self.scan_number()) Some(self.scan_number())
} }
} }
Some(ch) if ch.is_digit(10) => Some(self.scan_number()), Some('0'..='9') => Some(self.scan_number()),
Some(ch) if ch.is_alphabetic() => { Some('a'..='z') | Some('A'..='Z') => {
if self.looking_at("NaN") || self.looking_at("Inf") { if self.looking_at("NaN") || self.looking_at("Inf") {
Some(self.scan_number()) Some(self.scan_number())
} else { } else {
@@ -502,7 +500,8 @@ impl<'a> Lexer<'a> {
Some('"') => Some(self.scan_string()), Some('"') => Some(self.scan_string()),
Some('#') => Some(self.scan_hex_sequence()), Some('#') => Some(self.scan_hex_sequence()),
Some('@') => Some(self.scan_srcloc()), Some('@') => Some(self.scan_srcloc()),
Some(ch) if ch.is_whitespace() => { // all ascii whitespace
Some(' ') | Some('\x09'..='\x0d') => {
self.next_ch(); self.next_ch();
continue; continue;
} }