summaryrefslogblamecommitdiff
path: root/src/token/tokenise.rs
blob: 44ce6832ae4f382ef24ee8f484d66001ad8abd1d (plain) (tree)


































































































































                                                                                                                                            
/*
	Copyright 2023 Gabriel Jensen.

	This file is part of aas.

	aas is free software: you can redistribute it
	and/or modify it under the terms of the GNU
	General Public License as published by the Free
	Software Foundation, either version 3 of the
	License, or (at your option) any later version.

	aas is distributed in the hope that it will
	be useful, but WITHOUT ANY WARRANTY; without
	even the implied warranty of MERCHANTABILITY or
	FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	General Public License for more details.

	You should have received a copy of the GNU
	General Public License along with aas. If not,
	see <https://www.gnu.org/licenses/>.
*/

use crate::is_valid_character;
use crate::token::Token;

impl Token {
	#[must_use]
	pub fn tokenise(input: &str) -> Result<Vec<Self>, String> {
		let mut tokens: Vec<Self> = Vec::new();

		let mut input_index: usize = 0x0;
		while let Some(token) = get_next_token(&input, &mut input_index)? { tokens.push(token) }

		return Ok(tokens);
	}
}

#[must_use]
fn get_next_token(input: &str, index: &mut usize) -> Result<Option<Token>, String> {
	use Token::*;

	let mut word = String::new();

	let mut in_comment = false;
	let mut in_string  = false;

	for c in input.chars().skip(*index) {
		// Skip until we're out of the comment.
		if in_comment {
			if c != '\n' {
				*index += 0x1;
				continue;
			}

			in_comment = false;
		}

		// Finish the string (if inside one) and return.
		if in_string {
			*index += 0x1;

			if c != '"' {
				word.push(c);
				continue;
			}

			return Ok(Some(StringLiteral(word)));
		}

		// We don't care about invalid character inside of
		// comments or strings.
		if !is_valid_character(c) { return Err(format!("invalid character U+{:04X} '{c}' ({index} / {})", c as u32, input.len())) };

		// Check if the word is terminated. If it was, we
		// don't count this character.
		if !word.is_empty() {
			match c {
				| ' '
				| '\t'
				| '\n'
				| '.'
				| ','
				| ':'
				| ';'
				| '@'
				=> return Ok(Some(Word(word))),

				_ => {},
			};
		}

		// There aren't any more things to complete
		// (comments, strings, or words), so we know now
		// that no more characters will be skipped.
		*index += 0x1;

		match c {
			| ' '
			| '\t'
			=> continue,

			'\n' => return Ok(Some(Return)),
			'['  => return Ok(Some(BracketLeft)),
			']'  => return Ok(Some(BracketRight)),
			'.'  => return Ok(Some(Fullstop)),
			','  => return Ok(Some(Comma)),
			':'  => return Ok(Some(Colon)),
			'#'  => return Ok(Some(Hashtag)),

			| ';'
			| '@'
			=> {
				in_comment = true;
				continue;
			},

			'"' => {
				in_string = true;
				continue;
			}

			_ => {},
		};

		word.push(c);
	}

	if in_string { return Err("unterminated string".to_string()) };

	return Ok(None);
}