diff options
Diffstat (limited to 'src/token')
-rw-r--r-- | src/token/tokenise.rs | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/src/token/tokenise.rs b/src/token/tokenise.rs new file mode 100644 index 0000000..44ce683 --- /dev/null +++ b/src/token/tokenise.rs @@ -0,0 +1,131 @@ +/* + Copyright 2023 Gabriel Jensen. + + This file is part of aas. + + aas is free software: you can redistribute it + and/or modify it under the terms of the GNU + General Public License as published by the Free + Software Foundation, either version 3 of the + License, or (at your option) any later version. + + aas is distributed in the hope that it will + be useful, but WITHOUT ANY WARRANTY; without + even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU + General Public License along with aas. If not, + see <https://www.gnu.org/licenses/>. +*/ + +use crate::is_valid_character; +use crate::token::Token; + +impl Token { + #[must_use] + pub fn tokenise(input: &str) -> Result<Vec<Self>, String> { + let mut tokens: Vec<Self> = Vec::new(); + + let mut input_index: usize = 0x0; + while let Some(token) = get_next_token(&input, &mut input_index)? { tokens.push(token) } + + return Ok(tokens); + } +} + +#[must_use] +fn get_next_token(input: &str, index: &mut usize) -> Result<Option<Token>, String> { + use Token::*; + + let mut word = String::new(); + + let mut in_comment = false; + let mut in_string = false; + + for c in input.chars().skip(*index) { + // Skip until we're out of the comment. + if in_comment { + if c != '\n' { + *index += 0x1; + continue; + } + + in_comment = false; + } + + // Finish the string (if inside one) and return. + if in_string { + *index += 0x1; + + if c != '"' { + word.push(c); + continue; + } + + return Ok(Some(StringLiteral(word))); + } + + // We don't care about invalid character inside of + // comments or strings. + if !is_valid_character(c) { return Err(format!("invalid character U+{:04X} '{c}' ({index} / {})", c as u32, input.len())) }; + + // Check if the word is terminated. If it was, we + // don't count this character. + if !word.is_empty() { + match c { + | ' ' + | '\t' + | '\n' + | '.' + | ',' + | ':' + | ';' + | '@' + => return Ok(Some(Word(word))), + + _ => {}, + }; + } + + // There aren't any more things to complete + // (comments, strings, or words), so we know now + // that no more characters will be skipped. + *index += 0x1; + + match c { + | ' ' + | '\t' + => continue, + + '\n' => return Ok(Some(Return)), + '[' => return Ok(Some(BracketLeft)), + ']' => return Ok(Some(BracketRight)), + '.' => return Ok(Some(Fullstop)), + ',' => return Ok(Some(Comma)), + ':' => return Ok(Some(Colon)), + '#' => return Ok(Some(Hashtag)), + + | ';' + | '@' + => { + in_comment = true; + continue; + }, + + '"' => { + in_string = true; + continue; + } + + _ => {}, + }; + + word.push(c); + } + + if in_string { return Err("unterminated string".to_string()) }; + + return Ok(None); +} |