Skip to content

Commit bb90fa6

Browse files
Partial lexer implementation (#8)
* add foundational structures for code->machine translation * lay tokenizer groundwork * Switch to using regex * add all regex strings * add some matching and a failing test * cleaned up register matching logic * add tests for comments and unclean register strings * instruction tokenization and test for add * tokenize pseudoops * return vector of tokens * split into multiple files * num tokens and tests * add prefix label pass for lexer * Add test for stripping prefix labels * Add wrapper function for lexer interface * token validation fn shell & fn inlining * Implementation up to instruction formation, then stuck * use strum macros instead of shell tokens * Initial lexer skeleton * add load lexing * add all multi-field instructions to lexer * Add tests for currently implemented instructions --------- Co-authored-by: Bennett Petzold <[email protected]>
1 parent e78e00f commit bb90fa6

File tree

6 files changed

+366
-31
lines changed

6 files changed

+366
-31
lines changed

Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ path = "src/testcli.rs"
2222
anyhow = "1.0.95"
2323
once_cell = "1.20.2"
2424
regex = "1.11.1"
25+
strum = { version = "0.26.3", features = ["derive"] }
26+
strum_macros = "0.26.4"
2527
# To reduce error boilerplate
2628
thiserror = "2"
2729

src/assembler/lexer.rs

+327
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
use crate::{
2+
assembler::{MaybeUnresolvedInstr, Op, PseudoOp, Token},
3+
defs::{LC3Word, RegAddr},
4+
instruction::{ADD_OPCODE, AND_OPCODE, ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, ALL_LOAD_OPCODES, ALL_STORE_OPCODES, TRAP_OPCODE, NOT_OPCODE},
5+
};
6+
use anyhow::{bail, Result};
7+
8+
// All of these functions are inlined because they work on the same exact data but are split up for
9+
// legibility
10+
11+
/// First stage of the lexer operation, where any prefix labels are stripped out
12+
#[inline]
13+
pub fn prefix_label_pass(token_chain: &[Token]) -> (Option<&str>, &[Token]) {
14+
if token_chain[0].is_string() {
15+
let label_str: &str = match &token_chain[0] {
16+
Token::STRING(label) => label.as_str(),
17+
_ => panic!("This shouldn't happen"),
18+
};
19+
(Some(label_str), &token_chain[1..])
20+
} else {
21+
(None, token_chain)
22+
}
23+
}
24+
25+
/// Second stage of the lexer operation, where a chain of unresolved instructions is created from
26+
/// the asm op. If the line consists only of a comment, then an empty Vec is returned
27+
#[inline]
28+
pub fn construct_instruction_pass(token_chain: &[Token]) -> Result<Vec<MaybeUnresolvedInstr>> {
29+
let mut result: Vec<MaybeUnresolvedInstr> = Vec::new();
30+
31+
let operation = &token_chain[0];
32+
33+
if let Token::INSTR(op) = operation {
34+
fn check_reg<const SHIFT: usize>(
35+
token: &Token,
36+
instr: &mut MaybeUnresolvedInstr,
37+
) -> Result<(), anyhow::Error> {
38+
if let Token::REGISTER(reg) = token {
39+
instr.value |= (LC3Word::from(*reg) << SHIFT);
40+
Ok(())
41+
} else {
42+
bail!("NOT REG")
43+
}
44+
}
45+
46+
fn check_offset<const SHIFT: u8, const MAX_LEN: u8>(
47+
token: &Token,
48+
instr: &mut MaybeUnresolvedInstr,
49+
) -> Result<(), anyhow::Error> {
50+
if let Token::NUM(num) = token {
51+
let max_mask = const { 1 << (MAX_LEN + 1) };
52+
if *num < max_mask {
53+
instr.value |= num << SHIFT;
54+
Ok(())
55+
} else {
56+
bail!("TOO BIG")
57+
}
58+
} else if let Token::STRING(label) = token {
59+
instr
60+
.bindings
61+
.push((label.clone(), const { SHIFT + MAX_LEN }, SHIFT));
62+
Ok(())
63+
} else {
64+
bail!("NOT OFFSET")
65+
}
66+
}
67+
68+
fn check_reg_or_offset<const SHIFT: u8, const MAX_OFFSET_LEN: u8>(
69+
token: &Token,
70+
instr: &mut MaybeUnresolvedInstr,
71+
) -> Result<(), anyhow::Error> {
72+
if let Token::REGISTER(reg) = token {
73+
instr.value |= (LC3Word::from(*reg) << SHIFT);
74+
Ok(())
75+
} else if let Token::NUM(num) = token {
76+
let max_mask = const { 1 << (MAX_OFFSET_LEN + 1) };
77+
if *num < max_mask {
78+
instr.value |= num << SHIFT;
79+
instr.value |= 1 << MAX_OFFSET_LEN;
80+
Ok(())
81+
} else {
82+
bail!("TOO BIG")
83+
}
84+
} else if let Token::STRING(label) = token {
85+
instr
86+
.bindings
87+
.push((label.clone(), const { SHIFT + MAX_OFFSET_LEN }, SHIFT));
88+
Ok(())
89+
} else {
90+
bail!("NOT REG OR OFFSET")
91+
}
92+
}
93+
94+
let (opcode, sequence) = match op {
95+
Op::ADD => (
96+
ADD_OPCODE,
97+
[check_reg::<9>, check_reg::<6>, check_reg_or_offset::<0, 5>].as_slice(),
98+
),
99+
Op::AND => (
100+
AND_OPCODE,
101+
[check_reg::<9>, check_reg::<6>, check_reg_or_offset::<0, 5>].as_slice()),
102+
Op::LD => (
103+
ALL_LOAD_OPCODES[0],
104+
[check_reg::<9>, check_offset::<0, 9>].as_slice()
105+
),
106+
Op::LDI => (
107+
ALL_LOAD_OPCODES[1],
108+
[check_reg::<9>, check_offset::<0, 9>].as_slice()
109+
),
110+
Op::LDR => (
111+
ALL_LOAD_OPCODES[2],
112+
[check_reg::<9>, check_reg::<6>, check_offset::<0, 6>].as_slice()
113+
),
114+
Op::LEA => (
115+
ALL_LOAD_OPCODES[3],
116+
[check_reg::<9>, check_offset::<0, 9>].as_slice()
117+
),
118+
Op::ST => (
119+
ALL_STORE_OPCODES[0],
120+
[check_reg::<9>, check_offset::<0, 9>].as_slice()
121+
),
122+
Op::STI => (
123+
ALL_STORE_OPCODES[1],
124+
[check_reg::<9>, check_offset::<0, 9>].as_slice()
125+
),
126+
Op::STR => (
127+
ALL_STORE_OPCODES[2],
128+
[check_reg::<9>, check_reg::<6>, check_offset::<0, 6>].as_slice()
129+
),
130+
Op::NOT => (
131+
NOT_OPCODE,
132+
[check_reg::<9>, check_reg::<6>].as_slice()
133+
),
134+
_ => todo!(),
135+
};
136+
137+
let mut instr = MaybeUnresolvedInstr {
138+
// Shift opcode to start
139+
value: (opcode as LC3Word) << 12,
140+
bindings: Vec::new(),
141+
};
142+
143+
for (process, token) in sequence.iter().zip(&token_chain[1..]) {
144+
process(token, &mut instr)?;
145+
}
146+
147+
result.push(instr);
148+
} else if operation.is_meta() {
149+
todo!()
150+
} else if !operation.is_comment() {
151+
bail!("Line is invalid, does not start with an instruction!")
152+
}
153+
154+
Ok(result)
155+
}
156+
157+
/// Wrapper function to provide a cleaner API for the lexing passes
158+
pub fn lexer(token_chain: &[Token]) -> (Option<&str>, Result<Vec<MaybeUnresolvedInstr>>) {
159+
let (label, chain) = prefix_label_pass(token_chain);
160+
let result = construct_instruction_pass(chain);
161+
162+
// The result gets passed on so the assembler can attach more context to any error messages
163+
// generated (i.e. the expected address of the error)
164+
(label, result)
165+
}
166+
167+
#[cfg(test)]
168+
mod test {
169+
use super::*;
170+
171+
#[test]
172+
fn lex_label_instr() {
173+
let test_vec = vec![
174+
Token::STRING("LABEL1".to_string()),
175+
Token::INSTR(Op::ILLEGAL),
176+
];
177+
let (label, instr) = prefix_label_pass(&test_vec);
178+
179+
assert_eq!(label.unwrap(), "LABEL1");
180+
assert_eq!(instr[0], Token::INSTR(Op::ILLEGAL));
181+
}
182+
183+
#[test]
184+
fn lex_and_instr() {
185+
let test_vec = vec![
186+
Token::STRING("LABEL1".to_string()),
187+
Token::INSTR(Op::AND),
188+
Token::REGISTER(RegAddr::Zero),
189+
Token::REGISTER(RegAddr::One),
190+
Token::REGISTER(RegAddr::Zero)
191+
];
192+
let (label, instr) = lexer(&test_vec);
193+
194+
assert_eq!(label.unwrap(), "LABEL1");
195+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0101000001000000);
196+
197+
let test_vec = vec![
198+
Token::INSTR(Op::AND),
199+
Token::REGISTER(RegAddr::Three),
200+
Token::REGISTER(RegAddr::One),
201+
Token::NUM(0b10011)
202+
];
203+
let (label, instr) = lexer(&test_vec);
204+
205+
assert_eq!(label, None);
206+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0101011001110011);
207+
}
208+
209+
#[test]
210+
fn lex_add_instr() {
211+
let test_vec = vec![
212+
Token::STRING("LABEL1".to_string()),
213+
Token::INSTR(Op::ADD),
214+
Token::REGISTER(RegAddr::Zero),
215+
Token::REGISTER(RegAddr::One),
216+
Token::REGISTER(RegAddr::Zero)
217+
];
218+
let (label, instr) = lexer(&test_vec);
219+
220+
assert_eq!(label.unwrap(), "LABEL1");
221+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0001000001000000);
222+
223+
let test_vec = vec![
224+
Token::INSTR(Op::ADD),
225+
Token::REGISTER(RegAddr::Three),
226+
Token::REGISTER(RegAddr::One),
227+
Token::NUM(0b10011)
228+
];
229+
let (label, instr) = lexer(&test_vec);
230+
231+
assert_eq!(label, None);
232+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0001011001110011);
233+
}
234+
235+
#[test]
236+
fn lex_load_instrs() {
237+
let test_vec = vec![
238+
Token::INSTR(Op::LD),
239+
Token::REGISTER(RegAddr::Five),
240+
Token::NUM(0b000111000)
241+
];
242+
let (label, instr) = lexer(&test_vec);
243+
244+
assert_eq!(label, None);
245+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0010101000111000);
246+
247+
let test_vec = vec![
248+
Token::INSTR(Op::LDI),
249+
Token::REGISTER(RegAddr::Five),
250+
Token::NUM(0b000111000)
251+
];
252+
let (label, instr) = lexer(&test_vec);
253+
254+
assert_eq!(label, None);
255+
assert_eq!(instr.unwrap().first().unwrap().value, 0b1010101000111000);
256+
257+
let test_vec = vec![
258+
Token::INSTR(Op::LDR),
259+
Token::REGISTER(RegAddr::Five),
260+
Token::REGISTER(RegAddr::Two),
261+
Token::NUM(0b111000)
262+
];
263+
let (label, instr) = lexer(&test_vec);
264+
265+
assert_eq!(label, None);
266+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0110101010111000);
267+
268+
let test_vec = vec![
269+
Token::INSTR(Op::LEA),
270+
Token::REGISTER(RegAddr::Five),
271+
Token::NUM(0b000111000)
272+
];
273+
let (label, instr) = lexer(&test_vec);
274+
275+
assert_eq!(label, None);
276+
assert_eq!(instr.unwrap().first().unwrap().value, 0b1110101000111000);
277+
}
278+
279+
#[test]
280+
fn lex_store_instrs() {
281+
let test_vec = vec![
282+
Token::INSTR(Op::ST),
283+
Token::REGISTER(RegAddr::Five),
284+
Token::NUM(0b000111000)
285+
];
286+
let (label, instr) = lexer(&test_vec);
287+
288+
assert_eq!(label, None);
289+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0011101000111000);
290+
291+
let test_vec = vec![
292+
Token::INSTR(Op::STI),
293+
Token::REGISTER(RegAddr::Five),
294+
Token::NUM(0b000111000)
295+
];
296+
let (label, instr) = lexer(&test_vec);
297+
298+
assert_eq!(label, None);
299+
assert_eq!(instr.unwrap().first().unwrap().value, 0b1011101000111000);
300+
301+
let test_vec = vec![
302+
Token::INSTR(Op::STR),
303+
Token::REGISTER(RegAddr::Five),
304+
Token::REGISTER(RegAddr::Two),
305+
Token::NUM(0b111000)
306+
];
307+
let (label, instr) = lexer(&test_vec);
308+
309+
assert_eq!(label, None);
310+
assert_eq!(instr.unwrap().first().unwrap().value, 0b0111101010111000);
311+
}
312+
313+
#[test]
314+
fn lex_not_instr() {
315+
let test_vec = vec![
316+
Token::INSTR(Op::NOT),
317+
Token::REGISTER(RegAddr::Five),
318+
Token::REGISTER(RegAddr::Zero),
319+
];
320+
let (label, instr) = lexer(&test_vec);
321+
322+
assert_eq!(label, None);
323+
// This is the value that should be produced. Currently this fails, as there is no way to
324+
// insert arbitrary bits into instructions when forming them.
325+
assert_eq!(instr.unwrap().first().unwrap().value, 0b1001101000111111);
326+
}
327+
}

src/assembler/mod.rs

+19-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,28 @@
1-
use crate::defs::LC3Word;
1+
use crate::defs::{LC3Word, Op, PseudoOp, RegAddr};
2+
use strum::EnumIs;
3+
use strum_macros::EnumDiscriminants;
24

5+
pub mod lexer;
36
pub mod tokenizer;
47

8+
#[derive(Debug, Clone, Eq, PartialEq)]
59
pub struct MaybeUnresolvedInstr {
610
value: LC3Word,
711
///Label, Start offset, End offset
8-
bindings: Option<(String, u8, u8)>,
12+
bindings: Vec<(String, u8, u8)>,
13+
}
14+
15+
#[derive(Debug, Clone, Eq, PartialEq, EnumIs, EnumDiscriminants)]
16+
pub enum Token {
17+
INSTR(Op),
18+
REGISTER(RegAddr),
19+
META(PseudoOp),
20+
STRING(String),
21+
NUM(LC3Word),
22+
COMMENT(String),
23+
QUOTES,
24+
SEMICOLON,
25+
COMMA,
926
}
1027

1128
pub fn translate_line(line: &str) -> MaybeUnresolvedInstr {

src/assembler/tokenizer.rs

+2-14
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,8 @@ use anyhow::{bail, Result};
22
use once_cell::sync::Lazy;
33
use regex::{bytes::RegexSet, Regex};
44

5-
use crate::defs::{LC3Word, Op, PseudoOp, RegAddr};
6-
7-
#[derive(Debug, Clone, Eq, PartialEq)]
8-
pub enum Token {
9-
INSTR(Op),
10-
REGISTER(RegAddr),
11-
META(PseudoOp),
12-
STRING(String),
13-
NUM(LC3Word),
14-
COMMENT(String),
15-
QUOTES,
16-
SEMICOLON,
17-
COMMA,
18-
}
5+
use crate::assembler::Token;
6+
use crate::defs::{Op, PseudoOp, RegAddr};
197

208
// This follows the same ordering as defs.rs > pub enum Op
219
const INSTR_PATTERN: [&str; 23] = [

0 commit comments

Comments
 (0)