# NOTE(review): the following three lines appear to be scrape/paste artifacts
# from wherever this file was copied, not Python source; they are commented
# out here so the module remains importable.
# Spaces:
# Runtime error
# Runtime error
from __future__ import annotations | |
from typing import Callable | |
from ast_parser.chars import ( | |
is_coefficient_start, | |
is_digit, | |
is_variable_continue, | |
is_variable_start, | |
print_char_code, | |
) | |
from ast_parser.errors import LexerException | |
from ast_parser.token import Location, Token, TokenKind | |
class Lexer: | |
"""A Lexer is a stateful stream generator in that every time it is | |
advanced, it returns the next token in the source. | |
Assuming the source lexes, the final Token emitted by the Lexer | |
will be of kind EOF, after which the Lexer will repeatedly return | |
the same EOF token whenever called. | |
Args: | |
source (str): The source string being tokenized. | |
""" | |
_source: str | |
"""The source string being tokenized.""" | |
_token: Token | |
"""The currently active Token.""" | |
_line: int | |
"""The current line number.""" | |
_line_start: int | |
"""The index of the start of the current line.""" | |
def __init__(self, source: str) -> None: | |
self._source = source | |
self._token = Token(TokenKind.SOF, 0, 0, Location(0, 0), "") | |
self._line = 1 | |
self._line_start = 0 | |
def source(self) -> str: | |
"""Gets the source string being tokenized. | |
Returns: | |
str: The source string being tokenized. | |
""" | |
return self._source | |
def __iter__(self) -> Lexer: | |
"""Gets an iterator over the Tokens in the source. | |
Returns: | |
Lexer: An iterator over the tokens in the source. | |
""" | |
return self | |
def __next__(self) -> Token: | |
"""Gets the next Token from the source. | |
Raises: | |
StopIteration: The end of the source has been reached. | |
LexerException: Unexpected character, less than operator is | |
not allowed. | |
LexerException: Unexpected character, greater than operator | |
is not allowed. | |
LexerException: Invalid character: <code>. | |
LexerException: Invalid coefficient, unexpected digit after | |
0: <code>. | |
LexerException: Invalid coefficient, expected digit but | |
got: <code>. | |
Returns: | |
Token: The next token from the source. | |
""" | |
last_token = self._token | |
next_token = self._next_token() | |
if last_token.kind == TokenKind.EOF and next_token.kind == TokenKind.EOF: | |
if last_token.prev_token is None: | |
raise LexerException( | |
self._source, | |
last_token.location, | |
"Crude modification of the token chain is detected", | |
) | |
if last_token.prev_token.kind == TokenKind.EOF: | |
raise StopIteration | |
self._token = next_token | |
return last_token | |
self._token.next_token = next_token | |
next_token.prev_token = self._token | |
self._token = next_token | |
return last_token | |
def _create_token(self, kind: TokenKind, start: int, end: int, value: str) -> Token: | |
"""Creates a token with the given parameters. | |
A token is created relative to the current state of the Lexer. | |
Args: | |
kind (TokenKind): The kind of token. | |
start (int): The index of the first character of the token. | |
end (int): The index of the first character after the token. | |
value (str): The value of the token. | |
Returns: | |
Token: | |
""" | |
location = Location(self._line, 1 + start - self._line_start) | |
return Token(kind, start, end, location, value, self._token) | |
def _read_code(self, position: int) -> int | None: | |
"""Reads the character code at the given position in the source. | |
Args: | |
position (int): The index of the character to read. | |
Returns: | |
int | None: The character code at the given position, or | |
None if the position is out of bounds. | |
""" | |
return ord(self._source[position]) if position < len(self._source) else None | |
def _read_while(self, start: int, predicate: Callable[[int], bool]) -> int: | |
"""Reads a sequence of characters from the source starting at | |
the given position while the predicate is satisfied. | |
Args: | |
start (int): The index of the first character of the token. | |
predicate (Callable[[int], bool]): A function that takes a | |
character code and returns whether it satisfies the | |
predicate. | |
Returns: | |
int: The index of the first character after the sequence. | |
""" | |
position = start | |
while position < len(self._source) and predicate(ord(self._source[position])): | |
# print( | |
# self._source[position], | |
# ord(self._source[position]), | |
# is_digit(ord(self._source[position])), | |
# ) | |
position += 1 | |
return position | |
def _read_digits(self, start: int, first_code: int | None) -> int: | |
"""Reads a sequence of digits from the source starting at the | |
given position. | |
Args: | |
start (int): The index of the first character of the token. | |
first_code (int | None): The code of the first character of | |
the token. | |
Raises: | |
LexerException: Unexpected character, expected digit but | |
got: <code>. | |
Returns: | |
int: The index of the first character after the digits. | |
""" | |
if not is_digit(first_code): # not <digit> | |
raise LexerException( | |
self._source, | |
Location(self._line, 1 + start - self._line_start), | |
f"Unexpected character, expected digit but got: {print_char_code(first_code)}", | |
) | |
return self._read_while(start + 1, is_digit) | |
def _read_coefficient(self, start: int, first_code: int) -> Token: | |
"""Reads a coefficient token from the source starting at the | |
given position. | |
Args: | |
start (int): The index of the first character of the token. | |
first_code (int): The code of the first character of the | |
token. | |
Raises: | |
LexerException: Invalid coefficient, expected digit but | |
got: <code>. | |
Returns: | |
Token: The coefficient token. | |
""" | |
position, code = start, first_code | |
# Leftmost digits. | |
if code == 0x0030: # `0` | |
position += 1 | |
code = self._read_code(position) | |
if is_digit(code): # <digit> | |
raise LexerException( | |
self._source, | |
Location(self._line, 1 + position - self._line_start), | |
f"Invalid coefficient, unexpected digit after 0: {print_char_code(code)}", | |
) | |
elif code != 0x002E: # not `.` | |
position = self._read_digits(position, code) | |
code = self._read_code(position) | |
# Rightmost digits. | |
if code == 0x002E: # `.` | |
position += 1 | |
code = self._read_code(position) | |
position = self._read_digits(position, code) | |
code = self._read_code(position) | |
# Exponent. | |
if code in (0x0045, 0x0065): # `E` | `e` | |
position += 1 | |
code = self._read_code(position) | |
if code in (0x002B, 0x002D): # `+` | `-` | |
position += 1 | |
code = self._read_code(position) | |
position = self._read_digits(position, code) | |
return self._create_token( | |
TokenKind.COEFFICIENT, start, position, self._source[start:position] | |
) | |
def _read_variable(self, start: int) -> Token: | |
"""Reads a variable token from the source starting at the given | |
position. | |
Args: | |
start (int): The index of the first character of the token. | |
Returns: | |
Token: The variable token. | |
""" | |
position = self._read_while(start + 1, is_variable_continue) | |
return self._create_token( | |
TokenKind.VARIABLE, start, position, self._source[start:position] | |
) | |
def _next_token(self) -> Token: | |
"""Gets the next token from the source starting at the given | |
position. | |
This skips over whitespace until it finds the next lexable | |
token, then lexes punctuators immediately or calls the | |
appropriate helper function for more complicated tokens. | |
Raises: | |
LexerException: Unexpected character, less than operator is | |
not allowed. | |
LexerException: Unexpected character, greater than operator | |
is not allowed. | |
LexerException: Invalid character: <code>. | |
LexerException: Invalid coefficient, unexpected digit after | |
0: <code>. | |
LexerException: Invalid coefficient, expected digit but | |
got: <code>. | |
Returns: | |
Token: The next token from the source. | |
""" | |
position = self._token.end | |
while position < len(self._source): | |
char = self._source[position] | |
code = ord(char) | |
match code: | |
# Ignored: | |
# - unicode BOM; | |
# - white space; | |
# - line terminator. | |
case 0xFEFF | 0x0009 | 0x0020: # <BOM> | `\t` | <space> | |
position += 1 | |
continue | |
case 0x000A: # `\n` | |
position += 1 | |
self._line += 1 | |
self._line_start = position | |
continue | |
case 0x000D: # `\r` | |
position += ( | |
2 if self._read_code(position + 1) == 0x000A else 1 | |
) # `\r\n` | `\r` | |
self._line += 1 | |
self._line_start = position | |
continue | |
# Single-char tokens: | |
# - binary plus and minus operators; | |
# - multiplication operator; | |
# - relational operators; | |
# - comma. | |
case 0x002B | 0x002D | 0x002A: # `+` | `-` | `*` | |
return self._create_token( | |
TokenKind(char), position, position + 1, char | |
) | |
case 0x003D: # `=` | |
if self._read_code(position + 1) == 0x003D: | |
return self._create_token( | |
TokenKind.EQ, position, position + 2, "==" | |
) | |
return self._create_token( | |
TokenKind.EQ, position, position + 1, char | |
) | |
case 0x003C: # `<` | |
if self._read_code(position + 1) == 0x003D: # `=` | |
return self._create_token( | |
TokenKind.LEQ, position, position + 2, "<=" | |
) | |
raise LexerException( | |
self._source, | |
Location(self._line, 1 + position - self._line_start), | |
"Unexpected character, less than operator is not allowed", | |
) | |
case 0x003E: # `>` | |
if self._read_code(position + 1) == 0x003D: # `=` | |
return self._create_token( | |
TokenKind.GEQ, position, position + 2, ">=" | |
) | |
raise LexerException( | |
self._source, | |
Location(self._line, 1 + position - self._line_start), | |
"Unexpected character, greater than operator is not allowed", | |
) | |
case 0x2264: # `≤` | |
return self._create_token( | |
TokenKind.LEQ, position, position + 1, char | |
) | |
case 0x2265: # `≥` | |
return self._create_token( | |
TokenKind.GEQ, position, position + 1, char | |
) | |
case 0x002C: # `,` | |
return self._create_token( | |
TokenKind.COMMA, position, position + 1, char | |
) | |
# Multi-char tokens: | |
# - coefficient; | |
# - variable. | |
if is_coefficient_start(code): # <digit> | `.` | |
return self._read_coefficient(position, code) | |
if is_variable_start(code): # <alpha> | `_` | |
return self._read_variable(position) | |
raise LexerException( | |
self._source, | |
Location(self._line, 1 + position - self._line_start), | |
f"Invalid character: {print_char_code(code)}", | |
) | |
return self._create_token(TokenKind.EOF, position, position, "") | |
__all__ = ("Lexer",) | |