Skip to main content

revmc_codegen/bytecode/
asm.rs

1//! EVM bytecode assembler.
2
3use crate::{
4    U256, encode_pair, encode_single,
5    eyre::{self, Result},
6};
7use revm_bytecode::opcode::{self as op, OpCode};
8use revm_primitives::map::HashMap;
9use std::cmp::Ordering;
10
11/// Parse EVM assembly from a string into bytecode.
12///
13/// Assembles EVM mnemonics from a string into raw bytecode. Supports:
14/// - Standard EVM opcodes (`ADD`, `PUSH1 0x42`, etc.)
15/// - Auto-sized pushes (`PUSH 0x1234` picks the smallest encoding)
16/// - Labels: `name:` defines a label at the current PC, `PUSH %name` / `PUSHn %name` resolves to
17///   the label's byte offset
18/// - Comments starting with `;`
19/// - C-style `#define` macros (textual expansion before parsing), with optional parameters
20///
21/// ```evm
22/// #define PUSH_TWO(a, b) PUSH $a PUSH $b
23///
24/// entry:
25///   PUSH_TWO(1, 2)
26///   ADD
27///   PUSH %target
28///   JUMP
29///
30/// target:
31///   JUMPDEST
32///   STOP
33/// ```
34pub fn parse_asm(s: &str) -> Result<Vec<u8>> {
35    let tokens = preprocess(s)?;
36    let items = parse_items(&tokens)?;
37    layout_and_emit(&items)
38}
39
40// ———————————————————————————————————————————————————————————————————————
41// Tokenizer
42// ———————————————————————————————————————————————————————————————————————
43
44/// A token produced by the tokenizer.
45#[derive(Debug, Clone, PartialEq, Eq)]
46pub(super) struct Token<'a> {
47    /// The source text slice this token was produced from.
48    pub src: &'a str,
49    /// The kind of token.
50    pub kind: TokenKind,
51}
52
53/// The kind of a [`Token`].
54#[derive(Debug, Clone, PartialEq, Eq)]
55pub(super) enum TokenKind {
56    /// An identifier (opcode name, macro name, etc.).
57    Ident,
58    /// A label definition (`name:`).
59    Label,
60    /// A label reference (`%name`).
61    LabelRef,
62    /// A numeric literal.
63    Number(U256),
64    /// A comma.
65    Comma,
66    /// Opening parenthesis.
67    LParen,
68    /// Closing parenthesis.
69    RParen,
70    /// A macro parameter reference (`$name`).
71    ParamRef,
72    /// Whitespace.
73    Whitespace,
74    /// Comment text including the leading `;`.
75    Comment,
76    /// Unrecognized input.
77    Unknown,
78}
79
80/// Character-by-character tokenizer over source text.
81///
82/// Always emits all tokens including whitespace and comments.
83pub(super) struct Tokenizer<'a> {
84    src: &'a str,
85    pos: usize,
86}
87
88impl<'a> Tokenizer<'a> {
89    pub(super) fn new(src: &'a str) -> Self {
90        Self { src, pos: 0 }
91    }
92
93    fn remaining(&self) -> &'a str {
94        &self.src[self.pos..]
95    }
96
97    fn peek_char(&self) -> Option<char> {
98        self.remaining().chars().next()
99    }
100
101    fn advance(&mut self, n: usize) {
102        self.pos += n;
103    }
104
105    /// Read a contiguous word of alphanumeric/underscore characters.
106    fn read_word(&mut self) -> &'a str {
107        let rest = self.remaining();
108        let end = rest.find(|c: char| !c.is_ascii_alphanumeric() && c != '_').unwrap_or(rest.len());
109        let word = &rest[..end];
110        self.advance(end);
111        word
112    }
113
114    /// Read a numeric literal (decimal or 0x hex).
115    fn read_number(&mut self) -> Token<'a> {
116        let rest = self.remaining();
117        let end = if rest.starts_with("0x") || rest.starts_with("0X") {
118            2 + rest[2..].find(|c: char| !c.is_ascii_hexdigit()).unwrap_or(rest.len() - 2)
119        } else {
120            rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len())
121        };
122        let src = &rest[..end];
123        self.advance(end);
124        match src.parse::<U256>() {
125            Ok(n) => Token { src, kind: TokenKind::Number(n) },
126            Err(_) => Token { src, kind: TokenKind::Unknown },
127        }
128    }
129}
130
131impl<'a> Iterator for Tokenizer<'a> {
132    type Item = Token<'a>;
133
134    fn next(&mut self) -> Option<Token<'a>> {
135        // Emit whitespace as a token.
136        let rest = self.remaining();
137        let trimmed = rest.trim_start_matches(|c: char| c.is_ascii_whitespace());
138        let ws_len = rest.len() - trimmed.len();
139        if ws_len > 0 {
140            let src = &self.src[self.pos..self.pos + ws_len];
141            self.advance(ws_len);
142            return Some(Token { src, kind: TokenKind::Whitespace });
143        }
144
145        let c = self.peek_char()?;
146
147        match c {
148            ';' => {
149                let start = self.pos;
150                if let Some(nl) = self.remaining().find('\n') {
151                    self.advance(nl);
152                } else {
153                    self.pos = self.src.len();
154                }
155                Some(Token { src: &self.src[start..self.pos], kind: TokenKind::Comment })
156            }
157
158            '%' => {
159                let start = self.pos;
160                self.advance(1);
161                let name = self.read_word();
162                Some(if name.is_empty() {
163                    Token { src: &self.src[start..self.pos], kind: TokenKind::Unknown }
164                } else {
165                    Token { src: name, kind: TokenKind::LabelRef }
166                })
167            }
168
169            '$' => {
170                let start = self.pos;
171                self.advance(1);
172                let name = self.read_word();
173                Some(if name.is_empty() {
174                    Token { src: &self.src[start..self.pos], kind: TokenKind::Unknown }
175                } else {
176                    Token { src: name, kind: TokenKind::ParamRef }
177                })
178            }
179
180            ',' => {
181                let src = &self.src[self.pos..self.pos + 1];
182                self.advance(1);
183                Some(Token { src, kind: TokenKind::Comma })
184            }
185            '(' => {
186                let src = &self.src[self.pos..self.pos + 1];
187                self.advance(1);
188                Some(Token { src, kind: TokenKind::LParen })
189            }
190            ')' => {
191                let src = &self.src[self.pos..self.pos + 1];
192                self.advance(1);
193                Some(Token { src, kind: TokenKind::RParen })
194            }
195
196            '0'..='9' => Some(self.read_number()),
197
198            _ if c.is_ascii_alphabetic() || c == '_' => {
199                let word = self.read_word();
200                if self.peek_char() == Some(':') {
201                    self.advance(1);
202                    Some(Token { src: word, kind: TokenKind::Label })
203                } else {
204                    Some(Token { src: word, kind: TokenKind::Ident })
205                }
206            }
207
208            ':' => {
209                self.advance(1);
210                Some(Token { src: "", kind: TokenKind::Label })
211            }
212
213            _ => {
214                let start = self.pos;
215                // Consume consecutive unrecognized characters.
216                while let Some(c) = self.peek_char() {
217                    if c.is_ascii_whitespace()
218                        || c.is_ascii_alphanumeric()
219                        || matches!(c, '_' | ';' | '%' | '$' | ',' | '(' | ')' | ':')
220                    {
221                        break;
222                    }
223                    self.advance(c.len_utf8());
224                }
225                Some(Token { src: &self.src[start..self.pos], kind: TokenKind::Unknown })
226            }
227        }
228    }
229}
230
231// ———————————————————————————————————————————————————————————————————————
232// Preprocessor (#define macros)
233// ———————————————————————————————————————————————————————————————————————
234
235/// A macro definition: parameter names and body tokens.
236struct MacroDef<'a> {
237    /// Whether this is a function-like macro (invoked with parentheses).
238    is_fn: bool,
239    params: Vec<&'a str>,
240    body: Vec<Token<'a>>,
241}
242
243/// Builtin macros available in all assembly sources.
244fn builtin_macros() -> HashMap<&'static str, MacroDef<'static>> {
245    let mut m = HashMap::default();
246    m.insert(
247        "RET_WORD",
248        MacroDef {
249            is_fn: false,
250            params: vec![],
251            body: vec![
252                Token { src: "PUSH0", kind: TokenKind::Ident },
253                Token { src: "MSTORE", kind: TokenKind::Ident },
254                Token { src: "PUSH1", kind: TokenKind::Ident },
255                Token { src: "0x20", kind: TokenKind::Number(U256::from(0x20)) },
256                Token { src: "PUSH0", kind: TokenKind::Ident },
257                Token { src: "RETURN", kind: TokenKind::Ident },
258            ],
259        },
260    );
261    m
262}
263
264/// Preprocess source text: extract `#define` directives (line-scoped), tokenize the rest,
265/// then expand macro invocations on the token stream.
266fn preprocess(s: &str) -> Result<Vec<Token<'_>>> {
267    let mut macros = builtin_macros();
268
269    // Extract #define lines (tokenize their bodies in-place); keep remaining lines.
270    // Note: `#define` bodies borrow from `s` since their source text lives in `s`.
271    let mut rest_start = Vec::new();
272    for line in s.lines() {
273        let trimmed = line.trim();
274        if let Some(after) = trimmed.strip_prefix("#define")
275            && (after.is_empty() || after.starts_with(|c: char| c.is_ascii_whitespace()))
276        {
277            parse_define(after, &mut macros)?;
278        } else {
279            // Record (start, end) byte offsets into `s` for non-directive lines.
280            let offset = trimmed.as_ptr() as usize - s.as_ptr() as usize;
281            rest_start.push((offset, offset + trimmed.len()));
282        }
283    }
284
285    // Tokenize non-directive lines (borrowing from `s`), skipping whitespace and comments.
286    let mut raw = Vec::new();
287    for &(start, end) in &rest_start {
288        let line = &s[start..end];
289        raw.extend(
290            Tokenizer::new(line)
291                .filter(|t| !matches!(t.kind, TokenKind::Whitespace | TokenKind::Comment)),
292        );
293    }
294
295    if macros.is_empty() {
296        return Ok(raw);
297    }
298
299    expand_macros(raw, &macros)
300}
301
302/// Parse a `#define` directive body (everything after `#define`) into the macro table.
303fn parse_define<'a>(after: &'a str, macros: &mut HashMap<&'a str, MacroDef<'a>>) -> Result<()> {
304    let mut tok = Tokenizer::new(after)
305        .filter(|t| !matches!(t.kind, TokenKind::Whitespace | TokenKind::Comment))
306        .peekable();
307
308    let name = match tok.next() {
309        Some(Token { src, kind: TokenKind::Ident }) => src,
310        Some(other) => eyre::bail!("expected macro name after #define, got {other:?}"),
311        None => eyre::bail!("expected macro name after #define"),
312    };
313
314    // Function-like macro: NAME(a, b).
315    // Only if '(' immediately follows the name (no whitespace), matching C preprocessor semantics.
316    let is_fn = after.as_bytes().get(name.as_ptr() as usize - after.as_ptr() as usize + name.len())
317        == Some(&b'(');
318
319    let all_tokens: Vec<Token<'a>> = tok.collect();
320    let mut i = 0;
321
322    let mut params = Vec::new();
323    if is_fn && matches!(all_tokens.get(i), Some(Token { kind: TokenKind::LParen, .. })) {
324        i += 1; // consume '('
325        if !matches!(all_tokens.get(i), Some(Token { kind: TokenKind::RParen, .. })) {
326            loop {
327                match all_tokens.get(i) {
328                    Some(Token { src: p, kind: TokenKind::Ident }) => {
329                        params.push(*p);
330                        i += 1;
331                    }
332                    other => {
333                        eyre::bail!("expected parameter name in #define {name}, got {other:?}")
334                    }
335                }
336                match all_tokens.get(i) {
337                    Some(Token { kind: TokenKind::RParen, .. }) => {
338                        i += 1;
339                        break;
340                    }
341                    Some(Token { kind: TokenKind::Comma, .. }) => i += 1,
342                    other => eyre::bail!(
343                        "expected ',' or ')' in #define {name} parameter list, got {other:?}"
344                    ),
345                }
346            }
347        } else {
348            i += 1; // consume ')'
349        }
350    }
351
352    let body = all_tokens[i..].to_vec();
353    macros.insert(name, MacroDef { is_fn, params, body });
354    Ok(())
355}
356
357/// Expand macro invocations in a token stream.
358fn expand_macros<'a>(
359    tokens: Vec<Token<'a>>,
360    macros: &HashMap<&str, MacroDef<'a>>,
361) -> Result<Vec<Token<'a>>> {
362    let mut out = Vec::with_capacity(tokens.len());
363    let mut iter = tokens.into_iter().peekable();
364
365    while let Some(tok) = iter.next() {
366        let TokenKind::Ident = &tok.kind else {
367            out.push(tok);
368            continue;
369        };
370        let Some(mac) = macros.get(tok.src) else {
371            out.push(tok);
372            continue;
373        };
374        let name = tok.src;
375
376        if !mac.is_fn {
377            // Object-like macro: simple body substitution.
378            out.extend(mac.body.iter().cloned());
379        } else {
380            // Function-like macro: consume `(arg1, arg2, ...)`.
381            eyre::ensure!(
382                matches!(iter.next(), Some(Token { kind: TokenKind::LParen, .. })),
383                "macro {name:?} expects arguments",
384            );
385
386            // Parse arguments, handling nested parens.
387            let mut args: Vec<Vec<Token<'a>>> = vec![vec![]];
388            let mut depth = 1u32;
389            loop {
390                let t = iter
391                    .next()
392                    .ok_or_else(|| eyre::eyre!("unclosed '(' in macro invocation {name:?}"))?;
393                match &t.kind {
394                    TokenKind::LParen => {
395                        depth += 1;
396                        args.last_mut().unwrap().push(t);
397                    }
398                    TokenKind::RParen => {
399                        depth -= 1;
400                        if depth == 0 {
401                            break;
402                        }
403                        args.last_mut().unwrap().push(t);
404                    }
405                    TokenKind::Comma if depth == 1 => args.push(vec![]),
406                    _ => args.last_mut().unwrap().push(t),
407                }
408            }
409
410            // Zero-arg function-like: `FOO()` produces args = [[]]; expect 0.
411            if mac.params.is_empty() {
412                eyre::ensure!(
413                    args.len() == 1 && args[0].is_empty(),
414                    "macro {name:?} takes no arguments"
415                );
416            } else {
417                eyre::ensure!(
418                    args.len() == mac.params.len(),
419                    "macro {name:?} expects {} argument(s), got {}",
420                    mac.params.len(),
421                    args.len()
422                );
423            }
424
425            // Substitute $param refs in the body.
426            for body_tok in &mac.body {
427                if let TokenKind::ParamRef = body_tok.kind
428                    && let Some(idx) = mac.params.iter().position(|p| *p == body_tok.src)
429                {
430                    out.extend(args[idx].iter().cloned());
431                } else {
432                    out.push(body_tok.clone());
433                }
434            }
435        }
436    }
437    Ok(out)
438}
439
440// ———————————————————————————————————————————————————————————————————————
441// Parser (tokens → items)
442// ———————————————————————————————————————————————————————————————————————
443
444/// A parsed item from the source.
445enum Item<'a> {
446    /// A label definition (`name:`).
447    Label(&'a str),
448    /// An instruction.
449    Inst(Inst<'a>),
450}
451
452/// A parsed instruction.
453struct Inst<'a> {
454    opcode: u8,
455    imm: Option<Imm<'a>>,
456    push_kind: PushKind,
457}
458
459/// An immediate value.
460enum Imm<'a> {
461    /// A numeric literal.
462    Number(U256),
463    /// A label reference (resolved during layout).
464    Label(&'a str),
465}
466
/// How the width of an instruction's immediate is determined.
enum PushKind {
    /// The instruction has no immediate.
    None,
    /// The immediate occupies exactly this many bytes (e.g. `PUSH1`..`PUSH32`).
    Fixed(u8),
    /// A bare `PUSH`: the width is chosen from the value when the bytecode is emitted.
    Auto,
}
476
477/// Parse a token stream into items.
478fn parse_items<'a>(tokens: &[Token<'a>]) -> Result<Vec<Item<'a>>> {
479    let mut items = Vec::new();
480    let mut i = 0;
481    while i < tokens.len() {
482        match &tokens[i].kind {
483            TokenKind::Label => {
484                let name = tokens[i].src;
485                eyre::ensure!(!name.is_empty(), "empty label name");
486                items.push(Item::Label(name));
487                i += 1;
488            }
489            TokenKind::Ident => {
490                let word = tokens[i].src;
491                if word == "PUSH" {
492                    i += 1;
493                    let imm = expect_imm(tokens, &mut i, "PUSH")?;
494                    items.push(Item::Inst(Inst {
495                        opcode: 0,
496                        imm: Some(imm),
497                        push_kind: PushKind::Auto,
498                    }));
499                } else if word == "DUP" || word == "SWAP" {
500                    let is_swap = word == "SWAP";
501                    i += 1;
502                    let n = expect_number_u8(tokens, &mut i, word)?;
503                    eyre::ensure!(n >= 1, "{word} index must be >= 1, got {n}");
504                    if n <= 16 {
505                        let base = if is_swap { op::SWAP1 } else { op::DUP1 };
506                        items.push(Item::Inst(Inst {
507                            opcode: base + n - 1,
508                            imm: None,
509                            push_kind: PushKind::None,
510                        }));
511                    } else {
512                        let eof_op = if is_swap { op::SWAPN } else { op::DUPN };
513                        let raw = encode_single(n).ok_or_else(|| {
514                            eyre::eyre!("{word} index {n} out of valid range [1, 235]")
515                        })?;
516                        items.push(Item::Inst(Inst {
517                            opcode: eof_op,
518                            imm: Some(Imm::Number(U256::from(raw))),
519                            push_kind: PushKind::Fixed(1),
520                        }));
521                    }
522                } else {
523                    let opc = OpCode::parse(word)
524                        .ok_or_else(|| eyre::eyre!("invalid opcode: {word:?}"))?;
525                    let opcode = opc.get();
526                    i += 1;
527
528                    if opcode == op::DUPN || opcode == op::SWAPN {
529                        let n = expect_number_u8(tokens, &mut i, opc)?;
530                        let raw = encode_single(n).ok_or_else(|| {
531                            eyre::eyre!("{opc} index {n} out of valid range [17, 235]")
532                        })?;
533                        items.push(Item::Inst(Inst {
534                            opcode,
535                            imm: Some(Imm::Number(U256::from(raw))),
536                            push_kind: PushKind::Fixed(1),
537                        }));
538                    } else if opcode == op::EXCHANGE {
539                        let n = expect_number_u8(tokens, &mut i, opc)?;
540                        let m = expect_number_u8(tokens, &mut i, opc)?;
541                        let raw = encode_pair(n, m).ok_or_else(|| {
542                            eyre::eyre!("EXCHANGE pair ({n}, {m}) cannot be encoded")
543                        })?;
544                        items.push(Item::Inst(Inst {
545                            opcode,
546                            imm: Some(Imm::Number(U256::from(raw))),
547                            push_kind: PushKind::Fixed(1),
548                        }));
549                    } else {
550                        let imm_len = opc.info().immediate_size();
551                        if imm_len > 0 {
552                            let imm = expect_imm(tokens, &mut i, opc)?;
553                            items.push(Item::Inst(Inst {
554                                opcode,
555                                imm: Some(imm),
556                                push_kind: PushKind::Fixed(imm_len),
557                            }));
558                        } else {
559                            if matches!(
560                                tokens.get(i),
561                                Some(Token { kind: TokenKind::Number(_), .. })
562                            ) {
563                                eyre::bail!("unexpected immediate for opcode {opc}");
564                            }
565                            items.push(Item::Inst(Inst {
566                                opcode,
567                                imm: None,
568                                push_kind: PushKind::None,
569                            }));
570                        }
571                    }
572                }
573            }
574            TokenKind::Unknown => eyre::bail!("unexpected token: {:?}", tokens[i].src),
575            _ => eyre::bail!("unexpected token: {:?}", tokens[i]),
576        }
577    }
578    Ok(items)
579}
580
581/// Consume the next token as a number and convert to u8.
582fn expect_number_u8(
583    tokens: &[Token<'_>],
584    i: &mut usize,
585    ctx: impl std::fmt::Display,
586) -> Result<u8> {
587    let tok = tokens.get(*i).ok_or_else(|| eyre::eyre!("missing immediate for opcode {ctx}"))?;
588    *i += 1;
589    match &tok.kind {
590        TokenKind::Number(n) => {
591            let v: u64 =
592                n.try_into().map_err(|_| eyre::eyre!("invalid {ctx} immediate: too large"))?;
593            u8::try_from(v).map_err(|_| eyre::eyre!("invalid {ctx} immediate: too large"))
594        }
595        _ => eyre::bail!("expected numeric immediate for {ctx}, got {tok:?}"),
596    }
597}
598
599/// Consume the next token as an immediate (number or label ref).
600fn expect_imm<'a>(
601    tokens: &[Token<'a>],
602    i: &mut usize,
603    ctx: impl std::fmt::Display,
604) -> Result<Imm<'a>> {
605    let tok = tokens.get(*i).ok_or_else(|| eyre::eyre!("missing immediate for opcode {ctx}"))?;
606    *i += 1;
607    match &tok.kind {
608        TokenKind::Number(n) => Ok(Imm::Number(*n)),
609        TokenKind::LabelRef => Ok(Imm::Label(tok.src)),
610        _ => eyre::bail!("expected immediate for {ctx}, got {tok:?}"),
611    }
612}
613
614// ———————————————————————————————————————————————————————————————————————
615// Layout and emit
616// ———————————————————————————————————————————————————————————————————————
617
618/// Encode a U256 as big-endian bytes with optional fixed size.
619fn encode_imm(num: U256, size: Option<u8>) -> Result<Vec<u8>> {
620    let mut bytes = num.to_be_bytes_trimmed_vec();
621    if let Some(size) = size {
622        debug_assert!(size <= 32);
623        match bytes.len().cmp(&(size as usize)) {
624            Ordering::Less => {
625                let extend = size as usize - bytes.len();
626                bytes.splice(0..0, std::iter::repeat_n(0, extend));
627            }
628            Ordering::Equal => {}
629            Ordering::Greater => {
630                eyre::bail!("expected at most {size} immediate bytes, got {}", bytes.len());
631            }
632        }
633    }
634    debug_assert!(bytes.len() <= 32);
635    Ok(bytes)
636}
637
/// Number of bytes needed to represent `val`: 0 for zero, 1 for `1..=0xff`, 2 for
/// `0x100..=0xffff`, and so on.
fn min_push_width(val: usize) -> u8 {
    // For zero every bit is a leading zero, so this is 0 and no special case is needed.
    let significant_bits = usize::BITS - val.leading_zeros();
    significant_bits.div_ceil(8) as u8
}
647
648/// Layout items with label resolution (fixed-point for auto-sized label pushes) and emit bytecode.
649fn layout_and_emit(items: &[Item<'_>]) -> Result<Vec<u8>> {
650    let mut auto_label_indices = Vec::new();
651    let mut has_any_label = false;
652
653    for (i, item) in items.iter().enumerate() {
654        match item {
655            Item::Label(_) => has_any_label = true,
656            Item::Inst(inst) => {
657                if matches!(inst.imm, Some(Imm::Label(_))) {
658                    has_any_label = true;
659                    if matches!(inst.push_kind, PushKind::Auto) {
660                        auto_label_indices.push(i);
661                    }
662                }
663            }
664        }
665    }
666
667    // If no labels at all, just emit directly.
668    if !has_any_label {
669        let mut code = Vec::with_capacity(32);
670        for item in items {
671            if let Item::Inst(inst) = item {
672                emit_inst_no_labels(inst, &mut code)?;
673            }
674        }
675        return Ok(code);
676    }
677
678    // Fixed-point layout: auto-push widths start at 0 and grow monotonically.
679    let mut auto_widths = vec![0u8; items.len()];
680    let mut label_pcs = HashMap::<&str, usize>::default();
681
682    loop {
683        label_pcs.clear();
684        let mut pc = 0usize;
685        for (i, item) in items.iter().enumerate() {
686            match item {
687                Item::Label(name) => {
688                    label_pcs.insert(name, pc);
689                }
690                Item::Inst(inst) => {
691                    pc += 1;
692                    match &inst.push_kind {
693                        PushKind::None => {}
694                        PushKind::Fixed(n) => pc += *n as usize,
695                        PushKind::Auto => {
696                            let width = match &inst.imm {
697                                Some(Imm::Label(_)) => auto_widths[i],
698                                Some(Imm::Number(n)) => {
699                                    let bytes = n.to_be_bytes_trimmed_vec();
700                                    bytes.len() as u8
701                                }
702                                None => 0,
703                            };
704                            pc += width as usize;
705                        }
706                    }
707                }
708            }
709        }
710
711        let mut changed = false;
712        for &i in &auto_label_indices {
713            if let Item::Inst(inst) = &items[i]
714                && let Some(Imm::Label(name)) = &inst.imm
715            {
716                let target_pc =
717                    *label_pcs.get(name).ok_or_else(|| eyre::eyre!("undefined label: {name:?}"))?;
718                let needed = min_push_width(target_pc);
719                if needed > auto_widths[i] {
720                    auto_widths[i] = needed;
721                    changed = true;
722                }
723            }
724        }
725
726        if !changed {
727            break;
728        }
729    }
730
731    // Final emit.
732    let mut code = Vec::with_capacity(64);
733    for (i, item) in items.iter().enumerate() {
734        if let Item::Inst(inst) = item {
735            match &inst.push_kind {
736                PushKind::None => {
737                    code.push(inst.opcode);
738                }
739                PushKind::Fixed(size) => {
740                    code.push(inst.opcode);
741                    let val = resolve_imm(inst.imm.as_ref().unwrap(), &label_pcs)?;
742                    let bytes = encode_imm(val, Some(*size))?;
743                    code.extend_from_slice(&bytes);
744                }
745                PushKind::Auto => {
746                    let val = resolve_imm(inst.imm.as_ref().unwrap(), &label_pcs)?;
747                    let width = match &inst.imm {
748                        Some(Imm::Label(_)) => auto_widths[i],
749                        _ => {
750                            let bytes = val.to_be_bytes_trimmed_vec();
751                            bytes.len() as u8
752                        }
753                    };
754                    let push0 = OpCode::PUSH0.get();
755                    code.push(push0 + width);
756                    if width > 0 {
757                        let bytes = encode_imm(val, Some(width))?;
758                        code.extend_from_slice(&bytes);
759                    }
760                }
761            }
762        }
763    }
764
765    Ok(code)
766}
767
768/// Resolve an immediate value, substituting label PCs.
769fn resolve_imm(imm: &Imm<'_>, label_pcs: &HashMap<&str, usize>) -> Result<U256> {
770    match imm {
771        Imm::Number(n) => Ok(*n),
772        Imm::Label(name) => {
773            let pc = label_pcs.get(name).ok_or_else(|| eyre::eyre!("undefined label: {name:?}"))?;
774            Ok(U256::from(*pc))
775        }
776    }
777}
778
779/// Emit a single instruction (no-label fast path).
780fn emit_inst_no_labels(inst: &Inst<'_>, code: &mut Vec<u8>) -> Result<()> {
781    match &inst.push_kind {
782        PushKind::None => {
783            code.push(inst.opcode);
784        }
785        PushKind::Fixed(size) => {
786            code.push(inst.opcode);
787            let Imm::Number(n) = inst.imm.as_ref().unwrap() else {
788                unreachable!();
789            };
790            let bytes = encode_imm(*n, Some(*size))?;
791            code.extend_from_slice(&bytes);
792        }
793        PushKind::Auto => {
794            let Imm::Number(n) = inst.imm.as_ref().unwrap() else {
795                unreachable!();
796            };
797            let bytes = encode_imm(*n, None)?;
798            let push0 = OpCode::PUSH0.get();
799            code.push(push0 + bytes.len() as u8);
800            code.extend_from_slice(&bytes);
801        }
802    }
803    Ok(())
804}
805
#[cfg(test)]
mod tests {
    use super::*;
    use revm_bytecode::opcode as op;

    /// Assembles `src` and returns the bytecode, panicking at the caller's location if the
    /// assembler reports an error.
    #[track_caller]
    fn assemble(src: &str) -> Vec<u8> {
        parse_asm(src).unwrap()
    }

    /// Asserts that the assembler refuses `src`.
    #[track_caller]
    fn assert_rejected(src: &str) {
        assert!(parse_asm(src).is_err(), "expected an error for {src:?}");
    }

    #[test]
    fn basic_opcodes() {
        // Pairs of (assembly source, expected bytecode).
        let table = [
            ("ADD ; ADD\n ADD", vec![op::ADD, op::ADD]),
            ("PUSH1 0", vec![op::PUSH1, 0]),
            ("PUSH3 0x000069", vec![op::PUSH3, 0, 0, 0x69]),
            ("PUSH3 0x69 ; padded", vec![op::PUSH3, 0, 0, 0x69]),
            ("PUSH 0", vec![op::PUSH0]),
            ("PUSH 1", vec![op::PUSH1, 1]),
            ("PUSH 2", vec![op::PUSH1, 2]),
            ("PUSH 69", vec![op::PUSH1, 69]),
            ("PUSH 0x2222", vec![op::PUSH2, 0x22, 0x22]),
        ];
        for (src, want) in table {
            let got = parse_asm(src).unwrap_or_else(|e| panic!("code: {src:?}\n\n err: {e}"));
            assert_eq!(got, want, "{src:?}");
        }
    }

    #[test]
    fn label_forward_ref() {
        // The label is referenced before the line that defines it.
        let src = "
            PUSH %target
            JUMP
        target:
            JUMPDEST
            STOP
        ";
        assert_eq!(assemble(src), [op::PUSH1, 3, op::JUMP, op::JUMPDEST, op::STOP]);
    }

    #[test]
    fn label_backward_ref() {
        // The label sits at offset 0, so the auto-sized push is expected to become PUSH0.
        let src = "
        target:
            JUMPDEST
            PUSH %target
            JUMP
        ";
        assert_eq!(assemble(src), [op::JUMPDEST, op::PUSH0, op::JUMP]);
    }

    #[test]
    fn label_fixed_width() {
        // Same program as the forward-reference case, but with an explicit PUSH1.
        let src = "
            PUSH1 %target
            JUMP
        target:
            JUMPDEST
            STOP
        ";
        assert_eq!(assemble(src), [op::PUSH1, 3, op::JUMP, op::JUMPDEST, op::STOP]);
    }

    #[test]
    fn multiple_labels_same_pc() {
        // Two labels defined back to back must both resolve to the same offset.
        let src = "
        a:
        b:
            JUMPDEST
            PUSH %a
            PUSH %b
            STOP
        ";
        assert_eq!(assemble(src), [op::JUMPDEST, op::PUSH0, op::PUSH0, op::STOP]);
    }

    #[test]
    fn dup_auto() {
        // Depths 1..=16 select the dedicated DUP1..DUP16 opcodes (no immediate).
        assert_eq!(assemble("DUP 1"), [op::DUP1]);
        assert_eq!(assemble("DUP 16"), [op::DUP16]);
        // From 17 upwards the assembler emits DUPN followed by an encoded immediate.
        assert_eq!(assemble("DUP 17"), [op::DUPN, 0x80]);
        assert_eq!(assemble("DUP 108"), [op::DUPN, 0xDB]);
        // Zero is invalid and 236 is out of range.
        for src in ["DUP 0", "DUP 236"] {
            assert_rejected(src);
        }
    }

    #[test]
    fn swap_auto() {
        // Depths 1..=16 select the dedicated SWAP1..SWAP16 opcodes (no immediate).
        assert_eq!(assemble("SWAP 1"), [op::SWAP1]);
        assert_eq!(assemble("SWAP 16"), [op::SWAP16]);
        // From 17 upwards the assembler emits SWAPN followed by an encoded immediate.
        assert_eq!(assemble("SWAP 17"), [op::SWAPN, 0x80]);
        assert_eq!(assemble("SWAP 108"), [op::SWAPN, 0xDB]);
        // Zero is invalid.
        assert_rejected("SWAP 0");
    }

    #[test]
    fn dupn() {
        // The explicit DUPN mnemonic only takes operands of 17 and above.
        assert_eq!(assemble("DUPN 17"), [op::DUPN, 0x80]);
        assert_eq!(assemble("DUPN 108"), [op::DUPN, 0xDB]);
        for src in ["DUPN 16", "DUPN 0", "DUPN 236", "DUPN abc"] {
            assert_rejected(src);
        }
    }

    #[test]
    fn swapn() {
        assert_eq!(assemble("SWAPN 17"), [op::SWAPN, 0x80]);
        assert_eq!(assemble("SWAPN 108"), [op::SWAPN, 0xDB]);
        for src in ["SWAPN 16", "SWAPN 0"] {
            assert_rejected(src);
        }
    }

    #[test]
    fn exchange() {
        // Operands are written as two separate number tokens.
        assert_eq!(assemble("EXCHANGE 1 2"), [op::EXCHANGE, 0x8E]);
        assert!(parse_asm("EXCHANGE 1 14").is_ok());
        // In order: (2, 1) cannot be encoded, (0, 1) has a zero index, the second operand is
        // missing, and the operands are not numeric.
        for src in ["EXCHANGE 2 1", "EXCHANGE 0 1", "EXCHANGE 1", "EXCHANGE a b"] {
            assert_rejected(src);
        }
    }

    #[test]
    fn slotnum() {
        assert_eq!(assemble("SLOTNUM"), [op::SLOTNUM]);
    }

    #[test]
    fn undefined_label() {
        assert_rejected("PUSH %missing JUMP");
    }

    #[test]
    fn empty_label() {
        assert_rejected(": STOP");
    }

    #[test]
    fn empty_label_ref() {
        assert_rejected("PUSH % JUMP");
    }

    #[test]
    fn define_macro() {
        // An object-like macro may be used more than once.
        let src = "
            #define TWO PUSH 2
            TWO
            TWO
            ADD
        ";
        assert_eq!(assemble(src), [op::PUSH1, 2, op::PUSH1, 2, op::ADD]);
    }

    #[test]
    fn builtin_ret_word() {
        // `RET_WORD` is used here without any `#define` in the source.
        let want = [op::CALLVALUE, op::PUSH0, op::MSTORE, op::PUSH1, 0x20, op::PUSH0, op::RETURN];
        assert_eq!(assemble("CALLVALUE RET_WORD"), want);
    }

    #[test]
    fn define_override_builtin() {
        // A user `#define` takes precedence over the built-in of the same name.
        let src = "
            #define RET_WORD STOP
            RET_WORD
        ";
        assert_eq!(assemble(src), [op::STOP]);
    }

    #[test]
    fn define_with_args() {
        let src = "
            #define PUSH_TWO(a, b) PUSH $a PUSH $b
            PUSH_TWO(1, 2)
            ADD
        ";
        assert_eq!(assemble(src), [op::PUSH1, 1, op::PUSH1, 2, op::ADD]);
    }

    #[test]
    fn define_single_arg() {
        let src = "
            #define PUSH_AND_STORE(val) PUSH $val PUSH0 MSTORE
            PUSH_AND_STORE(0x42)
        ";
        assert_eq!(assemble(src), [op::PUSH1, 0x42, op::PUSH0, op::MSTORE]);
    }

    #[test]
    fn define_missing_args() {
        // A macro declared with a parameter is invoked without an argument list.
        let src = "
            #define FOO(a) PUSH $a
            FOO
        ";
        assert_rejected(src);
    }

    #[test]
    fn define_wrong_arg_count() {
        // Two parameters declared, one argument supplied.
        let src = "
            #define FOO(a, b) PUSH $a PUSH $b
            FOO(1)
        ";
        assert_rejected(src);
    }

    #[test]
    fn define_empty_name() {
        for src in ["#define", "#define  "] {
            assert_rejected(src);
        }
    }

    #[test]
    fn define_invalid_name() {
        // `-` is tokenized as Unknown, so it cannot serve as a macro name.
        assert_rejected("#define - STOP");
        // Here the directive has no name at all.
        assert_rejected("#define (a) STOP");
    }

    #[test]
    fn define_no_space_after_keyword() {
        // `#defineFOO` must not be recognised as a directive.
        assert_rejected("#defineFOO STOP");
    }

    #[test]
    fn define_zero_arg_fn() {
        let src = "
            #define NOP() ADD
            NOP()
        ";
        assert_eq!(assemble(src), [op::ADD]);
    }

    #[test]
    fn define_zero_arg_fn_bare_is_error() {
        // A function-like macro has to be invoked with parentheses.
        let src = "
            #define NOP() ADD
            NOP
        ";
        assert_rejected(src);
    }

    #[test]
    fn define_zero_arg_fn_with_args_is_error() {
        // A macro declared with no parameters is given one argument.
        let src = "
            #define NOP() ADD
            NOP(1)
        ";
        assert_rejected(src);
    }

    #[test]
    fn define_malformed_param_list() {
        // Missing closing paren.
        assert_rejected("#define FOO(a STOP");
        // Missing comma between params.
        assert_rejected("#define FOO(a b) STOP");
        // Number as param name.
        assert_rejected("#define FOO(1) STOP");
        // Trailing comma.
        assert_rejected("#define FOO(a,) STOP");
    }

    #[test]
    fn define_space_before_paren_is_object_like() {
        // With a space before `(`, `#define FOO (a) STOP` is object-like with body `(a) STOP`.
        // Expanding FOO therefore yields `(a) STOP`, and `(` is not a valid opcode.
        let src = "
            #define FOO (a) STOP
            FOO
        ";
        assert_rejected(src);
    }
}