src/token/tokenise.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

/*
	Copyright 2023 Gabriel Jensen.

	This file is part of aas.

	aas is free software: you can redistribute it
	and/or modify it under the terms of the GNU
	General Public License as published by the Free
	Software Foundation, either version 3 of the
	License, or (at your option) any later version.

	aas is distributed in the hope that it will
	be useful, but WITHOUT ANY WARRANTY; without
	even the implied warranty of MERCHANTABILITY or
	FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	General Public License for more details.

	You should have received a copy of the GNU
	General Public License along with aas. If not,
	see <https://www.gnu.org/licenses/>.
*/

use crate::is_valid_character;
use crate::token::Token;

impl Token {
	/// Splits `input` into its sequence of tokens.
	///
	/// Tokens are pulled one at a time from `get_next_token`, which advances
	/// a shared cursor through `input`, until it reports that nothing is
	/// left.
	///
	/// # Errors
	///
	/// Any error message produced by `get_next_token` is passed on unchanged,
	/// and the tokens gathered up to that point are discarded.
	#[must_use]
	pub fn tokenise(input: &str) -> Result<Vec<Self>, String> {
		// Position of the next unread character; owned here, advanced by
		// the scanner.
		let mut cursor: usize = 0x0;

		let mut collected: Vec<Self> = Vec::new();

		loop {
			match get_next_token(input, &mut cursor)? {
				Some(token) => collected.push(token),
				None        => break,
			}
		}

		Ok(collected)
	}
}

/// Scans `input` for the next token, starting at character offset `*index`.
///
/// `*index` counts characters (not bytes). It is advanced past every
/// character that was consumed, so that the following call resumes where this
/// one stopped. A character that merely terminates a word is *not* consumed;
/// it is looked at again by the next call.
///
/// Returns `Ok(Some(token))` for the next token and `Ok(None)` once the input
/// is exhausted. Spaces, tabs, and comments (from `;` or `@` up to, but not
/// including, the next line break) produce no tokens.
///
/// # Errors
///
/// Returns a message if a character outside of comments and strings is
/// rejected by `is_valid_character`, or if the input ends inside of a string
/// literal.
fn get_next_token(input: &str, index: &mut usize) -> Result<Option<Token>, String> {
	use Token::*;

	let mut word = String::new();

	let mut in_comment = false;
	let mut in_string  = false;

	// NOTE: `skip` has to decode every character before `*index` again, so
	// each call costs time proportional to the current offset.
	for c in input.chars().skip(*index) {
		// Skip until we're out of the comment. The line break itself is
		// not part of the comment and is handled below like any other.
		if in_comment {
			if c != '\n' {
				*index += 0x1;
				continue;
			}

			in_comment = false;
		}

		// Finish the string (if inside one) and return. The closing quote
		// is consumed but not stored.
		if in_string {
			*index += 0x1;

			if c != '"' {
				word.push(c);
				continue;
			}

			return Ok(Some(StringLiteral(word)));
		}

		// We don't care about invalid characters inside of comments or
		// strings, which is why this check comes after both of them.
		if !is_valid_character(c) {
			// `index` counts characters, so the total is given in
			// characters as well.
			return Err(format!("invalid character U+{:04X} '{c}' ({index} / {})", c as u32, input.chars().count()));
		}

		// Check if the word is terminated. If it was, we don't count this
		// character. Every character that forms a token of its own or that
		// opens a comment or a string must end the pending word; otherwise
		// the word would be dropped or merged into the string.
		let ends_word = matches!(
			c,
			| ' '
			| '\t'
			| '\n'
			| '['
			| ']'
			| '.'
			| ','
			| ':'
			| '#'
			| ';'
			| '@'
			| '"'
		);

		if ends_word && !word.is_empty() { return Ok(Some(Word(word))) };

		// There aren't any more things to complete
		// (comments, strings, or words), so we know now
		// that no more characters will be skipped.
		*index += 0x1;

		match c {
			| ' '
			| '\t'
			=> continue,

			'\n' => return Ok(Some(Return)),
			'['  => return Ok(Some(BracketLeft)),
			']'  => return Ok(Some(BracketRight)),
			'.'  => return Ok(Some(Fullstop)),
			','  => return Ok(Some(Comma)),
			':'  => return Ok(Some(Colon)),
			'#'  => return Ok(Some(Hashtag)),

			| ';'
			| '@'
			=> {
				in_comment = true;
				continue;
			},

			'"' => {
				in_string = true;
				continue;
			}

			_ => {},
		};

		word.push(c);
	}

	if in_string { return Err("unterminated string".to_string()) };

	// The input may end in the middle of a word (e.g. no trailing line
	// break); that word is still a token.
	if !word.is_empty() { return Ok(Some(Word(word))) };

	Ok(None)
}