summaryrefslogtreecommitdiff
path: root/src/token/tokenise.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/token/tokenise.rs')
-rw-r--r--src/token/tokenise.rs131
1 files changed, 131 insertions, 0 deletions
diff --git a/src/token/tokenise.rs b/src/token/tokenise.rs
new file mode 100644
index 0000000..44ce683
--- /dev/null
+++ b/src/token/tokenise.rs
@@ -0,0 +1,131 @@
+/*
+ Copyright 2023 Gabriel Jensen.
+
+ This file is part of aas.
+
+ aas is free software: you can redistribute it
+ and/or modify it under the terms of the GNU
+ General Public License as published by the Free
+ Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ aas is distributed in the hope that it will
+ be useful, but WITHOUT ANY WARRANTY; without
+ even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU
+ General Public License along with aas. If not,
+ see <https://www.gnu.org/licenses/>.
+*/
+
+use crate::is_valid_character;
+use crate::token::Token;
+
+impl Token {
+    /// Splits `input` into the sequence of tokens it contains.
+    ///
+    /// The tokens are produced by calling `get_next_token` repeatedly until
+    /// it reports that no tokens are left.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error message produced by `get_next_token` as soon as it
+    /// fails; tokens collected up to that point are discarded.
+    pub fn tokenise(input: &str) -> Result<Vec<Self>, String> {
+        let mut tokens: Vec<Self> = Vec::new();
+
+        // The cursor is owned by `get_next_token`: it starts at zero and is
+        // only ever handed back unchanged so that scanning resumes where the
+        // previous call stopped.
+        let mut cursor: usize = 0x0;
+        while let Some(token) = get_next_token(input, &mut cursor)? {
+            tokens.push(token);
+        }
+
+        Ok(tokens)
+    }
+}
+
+/// Reads the next token from `input`, starting at byte offset `*index`.
+///
+/// `*index` is advanced past every character that was consumed, so passing
+/// the same variable to the next call resumes scanning where this call
+/// stopped. A character that terminates a word is *not* consumed; it is left
+/// in place so that the next call can turn it into its own token.
+///
+/// Returns `Ok(Some(token))` for each token found and `Ok(None)` once the end
+/// of `input` has been reached without finding another token.
+///
+/// # Errors
+///
+/// Returns an error message if:
+///
+/// * `*index` is not a valid byte offset into `input`;
+/// * a character rejected by `is_valid_character` is found outside of
+///   comments and string literals;
+/// * a string literal is still open when the input ends.
+fn get_next_token(input: &str, index: &mut usize) -> Result<Option<Token>, String> {
+    use Token::*;
+
+    // Slice directly to the resume point instead of re-walking the input from
+    // its start on every call.
+    let rest = input
+        .get(*index..)
+        .ok_or_else(|| format!("invalid input offset ({index} / {})", input.len()))?;
+
+    let mut word = String::new();
+
+    let mut in_comment = false;
+    let mut in_string = false;
+
+    for c in rest.chars() {
+        // Skip until we're out of the comment. The newline that ends the
+        // comment falls through and is tokenised like any other newline.
+        if in_comment {
+            if c != '\n' {
+                *index += c.len_utf8();
+                continue;
+            }
+
+            in_comment = false;
+        }
+
+        // Finish the string (if inside one) and return. The closing quote is
+        // consumed but not stored.
+        if in_string {
+            *index += c.len_utf8();
+
+            if c != '"' {
+                word.push(c);
+                continue;
+            }
+
+            return Ok(Some(StringLiteral(word)));
+        }
+
+        // We don't care about invalid characters inside of comments or
+        // strings, which is why this check comes after both of them.
+        if !is_valid_character(c) {
+            return Err(format!(
+                "invalid character U+{:04X} '{c}' ({index} / {})",
+                c as u32,
+                input.len()
+            ));
+        }
+
+        // Check if the word is terminated. If it is, we don't count this
+        // character: it stays unconsumed for the next call. Every character
+        // that would start another kind of token has to be listed here,
+        // otherwise the word collected so far would be lost.
+        if !word.is_empty()
+            && matches!(
+                c,
+                ' ' | '\t' | '\n' | '.' | ',' | ':' | ';' | '@' | '[' | ']' | '#' | '"'
+            )
+        {
+            return Ok(Some(Word(word)));
+        }
+
+        // There aren't any more things to complete (comments, strings, or
+        // words), so we know now that this character will be consumed.
+        *index += c.len_utf8();
+
+        match c {
+            // Blanks between tokens carry no meaning.
+            ' ' | '\t' => {},
+
+            '\n' => return Ok(Some(Return)),
+            '[' => return Ok(Some(BracketLeft)),
+            ']' => return Ok(Some(BracketRight)),
+            '.' => return Ok(Some(Fullstop)),
+            ',' => return Ok(Some(Comma)),
+            ':' => return Ok(Some(Colon)),
+            '#' => return Ok(Some(Hashtag)),
+
+            // Both characters start a comment that runs to the end of the
+            // line.
+            ';' | '@' => in_comment = true,
+
+            '"' => in_string = true,
+
+            _ => word.push(c),
+        }
+    }
+
+    if in_string {
+        return Err("unterminated string".to_string());
+    }
+
+    // A word that runs up to the very end of the input has no terminator
+    // after it, but it is still a token.
+    if !word.is_empty() {
+        return Ok(Some(Word(word)));
+    }
+
+    Ok(None)
+}