Minor tokenizer improvements

This commit is contained in:
mdecimus 2024-10-16 10:49:01 +02:00
parent 0ee2fe3dee
commit a1dbd566fc
2 changed files with 42 additions and 69 deletions

69
Cargo.lock generated
View file

@ -1066,7 +1066,7 @@ dependencies = [
"dns-update",
"futures",
"hostname 0.4.0",
"hyper 1.4.1",
"hyper 1.5.0",
"idna 1.0.2",
"imagesize",
"imap_proto",
@ -1606,18 +1606,18 @@ dependencies = [
[[package]]
name = "derive_builder"
version = "0.20.1"
version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b"
checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947"
dependencies = [
"derive_builder_macro",
]
[[package]]
name = "derive_builder_core"
version = "0.20.1"
version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38"
checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8"
dependencies = [
"darling 0.20.10",
"proc-macro2",
@ -1627,9 +1627,9 @@ dependencies = [
[[package]]
name = "derive_builder_macro"
version = "0.20.1"
version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc"
checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
dependencies = [
"derive_builder_core",
"syn 2.0.79",
@ -2707,9 +2707,9 @@ checksum = "9994b79e8c1a39b3166c63ae7823bb2b00831e2a96a31399c50fe69df408eaeb"
[[package]]
name = "hyper"
version = "0.14.30"
version = "0.14.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9"
checksum = "8c08302e8fa335b151b788c775ff56e7a03ae64ff85c548ee820fecb70356e85"
dependencies = [
"bytes",
"futures-channel",
@ -2731,9 +2731,9 @@ dependencies = [
[[package]]
name = "hyper"
version = "1.4.1"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a"
dependencies = [
"bytes",
"futures-channel",
@ -2758,7 +2758,7 @@ checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590"
dependencies = [
"futures-util",
"http 0.2.12",
"hyper 0.14.30",
"hyper 0.14.31",
"rustls 0.21.12",
"tokio",
"tokio-rustls 0.24.1",
@ -2772,7 +2772,7 @@ checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333"
dependencies = [
"futures-util",
"http 1.1.0",
"hyper 1.4.1",
"hyper 1.5.0",
"hyper-util",
"rustls 0.23.14",
"rustls-pki-types",
@ -2788,7 +2788,7 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
dependencies = [
"hyper 1.4.1",
"hyper 1.5.0",
"hyper-util",
"pin-project-lite",
"tokio",
@ -2806,7 +2806,7 @@ dependencies = [
"futures-util",
"http 1.1.0",
"http-body 1.0.1",
"hyper 1.4.1",
"hyper 1.5.0",
"pin-project-lite",
"socket2",
"tokio",
@ -3241,7 +3241,7 @@ dependencies = [
"futures-util",
"hkdf",
"http-body-util",
"hyper 1.4.1",
"hyper 1.5.0",
"hyper-util",
"jmap_proto",
"lz4_flex",
@ -4095,12 +4095,9 @@ dependencies = [
[[package]]
name = "once_cell"
version = "1.20.1"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1"
dependencies = [
"portable-atomic",
]
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "opaque-debug"
@ -4110,9 +4107,9 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
[[package]]
name = "openssl"
version = "0.10.66"
version = "0.10.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1"
checksum = "7b8cefcf97f41316955f9294cd61f639bdcfa9f2f230faac6cb896aa8ab64704"
dependencies = [
"bitflags 2.6.0",
"cfg-if",
@ -4142,9 +4139,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.103"
version = "0.9.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741"
dependencies = [
"cc",
"libc",
@ -4656,9 +4653,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.86"
version = "1.0.87"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a"
dependencies = [
"unicode-ident",
]
@ -5115,7 +5112,7 @@ dependencies = [
"h2 0.3.26",
"http 0.2.12",
"http-body 0.4.6",
"hyper 0.14.30",
"hyper 0.14.31",
"hyper-rustls 0.24.2",
"ipnet",
"js-sys",
@ -5159,7 +5156,7 @@ dependencies = [
"http 1.1.0",
"http-body 1.0.1",
"http-body-util",
"hyper 1.4.1",
"hyper 1.5.0",
"hyper-rustls 0.27.3",
"hyper-util",
"ipnet",
@ -5396,7 +5393,7 @@ dependencies = [
"hex",
"hmac 0.12.1",
"http 0.2.12",
"hyper 0.14.30",
"hyper 0.14.31",
"hyper-rustls 0.24.2",
"log",
"maybe-async",
@ -6073,7 +6070,7 @@ dependencies = [
"directory",
"form_urlencoded",
"http-body-util",
"hyper 1.4.1",
"hyper 1.5.0",
"hyper-util",
"lru-cache",
"mail-auth",
@ -6443,7 +6440,7 @@ dependencies = [
"form_urlencoded",
"futures",
"http-body-util",
"hyper 1.4.1",
"hyper 1.5.0",
"hyper-util",
"imap",
"imap_proto",
@ -6744,7 +6741,7 @@ dependencies = [
"http 1.1.0",
"http-body 1.0.1",
"http-body-util",
"hyper 1.4.1",
"hyper 1.5.0",
"hyper-timeout",
"hyper-util",
"percent-encoding",
@ -7132,9 +7129,9 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.10.0"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a"
dependencies = [
"getrandom",
]
@ -7352,7 +7349,7 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.52.0",
"windows-sys 0.59.0",
]
[[package]]

View file

@ -68,12 +68,7 @@ impl<'x> Iterator for TypesTokenizer<'x> {
}
// Try parsing email
if self.tokenize_emails && token.word.is_email_atom()
/*&& self.peek_has_tokens(
&[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
TokenType::Space,
)*/
{
if self.tokenize_emails && token.word.is_email_atom() {
self.peek_rewind();
if let Some(email) = self.try_parse_email() {
self.peek_advance();
@ -83,9 +78,7 @@ impl<'x> Iterator for TypesTokenizer<'x> {
}
// Try parsing URL without scheme
if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true)
//&& self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
{
if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true) {
self.peek_rewind();
if let Some(url) = self.try_parse_url(None) {
self.peek_advance();
@ -247,30 +240,6 @@ impl<'x> TypesTokenizer<'x> {
self.peek_pos = 0;
}
/*fn peek_has_tokens(
&mut self,
tokens: &[TokenType<&'_ str>],
stop_token: impl Fn(&TokenType<&'_ str>) -> bool,
) -> bool {
let mut tokens = tokens.iter().copied();
let mut token = tokens.next().unwrap();
while let Some(t) = self.peek() {
if t.word == token {
if let Some(next_token) = tokens.next() {
token = next_token;
} else {
self.peek_rewind();
return true;
}
} else if stop_token(&t.word) {
break;
}
}
self.peek_rewind();
false
}*/
fn try_parse_url(
&mut self,
scheme_token: Option<Token<TokenType<&'_ str>>>,
@ -498,6 +467,9 @@ impl<'x> TypesTokenizer<'x> {
// Find local part
loop {
let token = self.peek()?;
if token.to - start_token.from > 255 {
return None;
}
match token.word {
word if word.is_email_atom() => {
last_is_dot = false;
@ -585,6 +557,10 @@ impl<'x> TypesTokenizer<'x> {
}
end_pos = token.to;
restore_pos = self.peek_pos;
if end_pos - start_pos > 255 {
return None;
}
}
self.peek_pos = restore_pos;