summaryrefslogtreecommitdiff
path: root/src/token
diff options
context:
space:
mode:
Diffstat (limited to 'src/token')
-rw-r--r--src/token/tokenise.rs185
1 files changed, 110 insertions, 75 deletions
diff --git a/src/token/tokenise.rs b/src/token/tokenise.rs
index e713baa..743051b 100644
--- a/src/token/tokenise.rs
+++ b/src/token/tokenise.rs
@@ -1,131 +1,166 @@
/*
- Copyright 2023 Gabriel Bjørnager Jensen.
+ Copyright 2023-2024 Gabriel Bjørnager Jensen.
- This file is part of AAS.
+ This file is part of eAS.
- AAS is free software: you can redistribute it
+ eAS is free software: you can redistribute it
and/or modify it under the terms of the GNU
General Public License as published by the Free
Software Foundation, either version 3 of the
License, or (at your option) any later version.
- AAS is distributed in the hope that it will
+ eAS is distributed in the hope that it will
be useful, but WITHOUT ANY WARRANTY; without
even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU
- General Public License along with AAS. If not,
+ General Public License along with eAS. If not,
see <https://www.gnu.org/licenses/>.
*/
use crate::is_valid_character;
+use crate::error::Error;
+use crate::source_location::SourceLocation;
use crate::token::Token;
impl Token {
#[must_use]
- pub fn tokenise(input: &str) -> Result<Vec<Self>, String> {
- let mut tokens: Vec<Self> = Vec::new();
+ pub fn tokenise(input: &str, location: &mut SourceLocation) -> Result<Vec<(SourceLocation, Self)>, Error> {
+ let mut tokens: Vec<(SourceLocation, Self)> = Vec::new();
let mut input_index: usize = 0x0;
- while let Some(token) = get_next_token(&input, &mut input_index)? { tokens.push(token) }
+ while let Some(token) = get_next_token(&input, &mut input_index, location)? { tokens.push(token) }
return Ok(tokens);
}
}
#[must_use]
-fn get_next_token(input: &str, index: &mut usize) -> Result<Option<Token>, String> {
+fn get_next_token(input: &str, index: &mut usize, location: &mut SourceLocation) -> Result<Option<(SourceLocation, Token)>, Error> {
use Token::*;
- let mut word = String::new();
-
- let mut in_comment = false;
- let mut in_string = false;
-
for c in input.chars().skip(*index) {
- // Skip until we're out of the comment.
- if in_comment {
- if c != '\n' {
- *index += 0x1;
- continue;
- }
-
- in_comment = false;
- }
-
- // Finish the string (if inside one) and return.
- if in_string {
- *index += 0x1;
-
- if c != '"' {
- word.push(c);
- continue;
- }
-
- return Ok(Some(StringLiteral(word)));
- }
-
- // We don't care about invalid character inside of
- // comments or strings.
- if !is_valid_character(c) { return Err(format!("invalid character U+{:04X} '{c}' ({index} / {})", c as u32, input.len())) };
-
- // Check if the word is terminated. If it was, we
- // don't count this character.
- if !word.is_empty() {
- match c {
- | ' '
- | '\t'
- | '\n'
- | '.'
- | ','
- | ':'
- | ';'
- | '@'
- => return Ok(Some(Word(word))),
-
- _ => {},
- };
- }
+ if !is_valid_character(c) { return Err(Error::IllegalCharacter(c, location.clone()) ) };
// There aren't any more things to complete
// (comments, strings, or words), so we know now
// that no more characters will be skipped.
- *index += 0x1;
+
+ let token_start = location.clone();
+
+ match c {
+ | ' '
+ | '\t'
+ | '\n'
+ | '['
+ | ']'
+ | '.'
+ | ','
+ | '#'
+ | ';'
+ | '"'
+ => {
+ *index += 0x1;
+ location.next_column();
+ },
+
+ _ => {},
+ };
match c {
| ' '
| '\t'
=> continue,
- '\n' => return Ok(Some(Return)),
- '[' => return Ok(Some(BracketLeft)),
- ']' => return Ok(Some(BracketRight)),
- '.' => return Ok(Some(FullStop)),
- ',' => return Ok(Some(Comma)),
- ':' => return Ok(Some(Colon)),
- '#' => return Ok(Some(Hashtag)),
+ '\n' => {
+ location.return_carriage();
+ return Ok(Some((token_start, Return)));
+ },
- | ';'
- | '@'
- => {
- in_comment = true;
- continue;
+ '[' => return Ok(Some((token_start, BracketLeft))),
+ ']' => return Ok(Some((token_start, BracketRight))),
+ '.' => return Ok(Some((token_start, FullStop))),
+ ',' => return Ok(Some((token_start, Comma))),
+ '#' => return Ok(Some((token_start, Hashtag))),
+
+ ';' => {
+ skip_line(input, index, location);
+ return Ok(Some((token_start, Return)));
},
'"' => {
- in_string = true;
- continue;
+ return match complete_string(input, index, location) {
+ Ok(string) => Ok(Some((token_start, StringLiteral(string)))),
+ _ => Err(Error::UnterminatedString(token_start)),
+ };
}
_ => {},
};
- word.push(c);
+ match complete_word(input, index, location) {
+ Some(word) => return Ok(Some((token_start, Word(word)))),
+ _ => {},
+ };
}
- if in_string { return Err("unterminated string".to_string()) };
-
return Ok(None);
}
+
+#[must_use]
+fn complete_word(input: &str, index: &mut usize, location: &mut SourceLocation) -> Option<String> {
+ let mut buffer = String::new();
+
+ for c in input.chars().skip(*index) {
+ match c {
+ | ' '
+ | '\t'
+ | '\n'
+ | '.'
+ | ','
+ | ';'
+ => return Some(buffer),
+
+ _ => buffer.push(c),
+ }
+
+ // Don't count the terminating character.
+ *index += 0x1;
+ location.next_column();
+ }
+
+ return None;
+}
+
+#[must_use]
+fn complete_string(input: &str, index: &mut usize, location: &mut SourceLocation) -> Result<String, ()> {
+ let mut buffer = String::new();
+
+ for c in input.chars().skip(*index) {
+ *index += 0x1;
+
+ match c {
+ '\n' => return Err(()),
+ '"' => return Ok(buffer),
+ _ => {},
+ };
+
+ location.next_column();
+
+ buffer.push(c);
+ }
+
+ return Err(());
+}
+
+fn skip_line(input: &str, index: &mut usize, location: &mut SourceLocation) {
+ for c in input.chars().skip(*index) {
+ // Skip until we're out of the comment.
+ *index += 0x1;
+ if c == '\n' { break };
+ }
+
+ location.return_carriage();
+}