diff --git a/c2rust-ast-exporter/src/AstExporter.cpp b/c2rust-ast-exporter/src/AstExporter.cpp index 16396f2948..a81634ce65 100644 --- a/c2rust-ast-exporter/src/AstExporter.cpp +++ b/c2rust-ast-exporter/src/AstExporter.cpp @@ -54,7 +54,8 @@ using clang::ASTContext; using clang::QualType; using std::string; -namespace { +namespace { // for local definitions, preferred to making each `static` + // Encode a string object assuming that it is valid UTF-8 encoded text void cbor_encode_string(CborEncoder *encoder, const std::string &str) { auto ptr = str.data(); @@ -105,7 +106,6 @@ std::optional getIntegerConstantExpr(const Expr &E, return E.getIntegerConstantExpr(Ctx); } #endif // CLANG_VERSION_MAJOR -} // namespace DiagnosticBuilder getDiagBuilder(ASTContext *Context, SourceLocation Loc, @@ -154,6 +154,20 @@ void printDiag(ASTContext *Context, DiagnosticsEngine::Level Lvl, std::string Me printDiag(Context, Lvl, Message, loc, t->getSourceRange()); } +// Extend the source range to include its entire final token. Clang source +// ranges are stored as ranges of tokens, and their end will point to the +// first byte of the final token, rather than its last byte. This converts +// the range to a character range and extends its endpoint to the final +// character of the final token. 
+void expandSpanToFinalChar(SourceRange& span, ASTContext* Context) { + auto &Mgr = Context->getSourceManager(); + auto charRange = clang::CharSourceRange::getCharRange(span); + charRange.setEnd(clang::Lexer::getLocForEndOfToken(span.getEnd(), 0, Mgr, Context->getLangOpts())); + span = charRange.getAsRange(); +} + +} // namespace + class TranslateASTVisitor; class TypeEncoder final : public TypeVisitor { @@ -810,7 +824,9 @@ class TranslateASTVisitor final // See https://github.com/immunant/c2rust/issues/1124 bool isRValue = ast->getValueKind() == VK_PRValue; #endif - encode_entry_raw(ast, tag, ast->getSourceRange(), ty, isRValue, isVaList, + auto span = ast->getSourceRange(); + expandSpanToFinalChar(span, Context); + encode_entry_raw(ast, tag, span, ty, isRValue, isVaList, encodeMacroExpansions, childIds, extra); typeEncoder.VisitQualTypeOf(ty, ast); } @@ -822,7 +838,9 @@ class TranslateASTVisitor final auto rvalue = false; auto isVaList = false; auto encodeMacroExpansions = false; - encode_entry_raw(ast, tag, ast->getSourceRange(), s, rvalue, isVaList, + auto span = ast->getSourceRange(); + expandSpanToFinalChar(span, Context); + encode_entry_raw(ast, tag, span, s, rvalue, isVaList, encodeMacroExpansions, childIds, extra); } @@ -832,7 +850,9 @@ class TranslateASTVisitor final std::function extra = [](CborEncoder *) {}) { auto rvalue = false; auto encodeMacroExpansions = false; - encode_entry_raw(ast, tag, ast->getSourceRange(), T, rvalue, + auto span = ast->getSourceRange(); + expandSpanToFinalChar(span, Context); + encode_entry_raw(ast, tag, span, T, rvalue, isVaList(ast, T), encodeMacroExpansions, childIds, extra); } @@ -845,6 +865,7 @@ class TranslateASTVisitor final std::function extra = [](CborEncoder *) {}) { auto rvalue = false; auto encodeMacroExpansions = false; + expandSpanToFinalChar(loc, Context); encode_entry_raw(ast, tag, loc, T, rvalue, isVaList(ast, T), encodeMacroExpansions, childIds, extra); } @@ -973,6 +994,8 @@ class TranslateASTVisitor final 
std::vector childIds; auto range = SourceRange(Mac->getDefinitionLoc(), Mac->getDefinitionEndLoc()); + // Extend the range to include the entire final token. + expandSpanToFinalChar(range, Context); encode_entry_raw(Mac, tag, range, QualType(), false, false, false, childIds, [Name](CborEncoder *local) { cbor_encode_string(local, Name.str()); @@ -1973,7 +1996,7 @@ class TranslateASTVisitor final // } // Use the parameters from the function declaration - // the defines the body, if one exists. + // that defines the body, if one exists. const FunctionDecl *paramsFD = FD; auto body = FD->getBody(paramsFD); // replaces its argument if body exists @@ -2084,7 +2107,7 @@ class TranslateASTVisitor final if (!VD->isCanonicalDecl() && !VD->isExternC()) { // Emit non-canonical decl so we have a placeholder to attach comments to std::vector childIds = {VD->getCanonicalDecl()}; - encode_entry(VD, TagNonCanonicalDecl, VD->getLocation(), childIds, VD->getType()); + encode_entry(VD, TagNonCanonicalDecl, VD->getSourceRange(), childIds, VD->getType()); typeEncoder.VisitQualTypeOf(VD->getType(), VD); return true; } @@ -2119,7 +2142,7 @@ class TranslateASTVisitor final // type auto T = def->getType(); - auto loc = is_defn ? def->getLocation() : VD->getLocation(); + auto loc = is_defn ? def->getSourceRange() : VD->getSourceRange(); encode_entry( VD, TagVarDecl, loc, childIds, T, @@ -2183,7 +2206,7 @@ class TranslateASTVisitor final // Attributes may also be attached to the non-canonical declaration so // we emit them too. std::vector childIds = {D->getCanonicalDecl()}; - encode_entry(D, TagNonCanonicalDecl, D->getLocation(), childIds, QualType(), + encode_entry(D, TagNonCanonicalDecl, D->getSourceRange(), childIds, QualType(), [D](CborEncoder *local) { // 1. 
Attributes stored as an array of attribute names CborEncoder attrs; @@ -2203,7 +2226,7 @@ class TranslateASTVisitor final auto t = D->getTypeForDecl(); - auto loc = D->getLocation(); + auto loc = D->getSourceRange(); std::vector childIds; if (def) { for (auto decl : def->decls()) { @@ -2219,7 +2242,7 @@ class TranslateASTVisitor final // Since the RecordDecl D isn't the complete definition, // the actual location should be given. This should handle opaque // types. - loc = def->getLocation(); + loc = def->getSourceRange(); const ASTRecordLayout &layout = this->Context->getASTRecordLayout(def); @@ -2309,7 +2332,7 @@ class TranslateASTVisitor final if (!D->isCanonicalDecl()) { // Emit non-canonical decl so we have a placeholder to attach comments to std::vector childIds = {D->getCanonicalDecl()}; - encode_entry(D, TagNonCanonicalDecl, D->getLocation(), childIds, QualType()); + encode_entry(D, TagNonCanonicalDecl, D->getSourceRange(), childIds, QualType()); return true; } @@ -2335,7 +2358,7 @@ class TranslateASTVisitor final if (!D->isCanonicalDecl()) { // Emit non-canonical decl so we have a placeholder to attach comments to std::vector childIds = {D->getCanonicalDecl()}; - encode_entry(D, TagNonCanonicalDecl, D->getLocation(), childIds, D->getType()); + encode_entry(D, TagNonCanonicalDecl, D->getSourceRange(), childIds, D->getType()); typeEncoder.VisitQualTypeOf(D->getType(), D); return true; } @@ -2418,7 +2441,7 @@ class TranslateASTVisitor final if (!D->isCanonicalDecl()) { // Emit non-canonical decl so we have a placeholder to attach comments to std::vector childIds = {D->getCanonicalDecl()}; - encode_entry(D, TagNonCanonicalDecl, D->getLocation(), childIds, typeForDecl); + encode_entry(D, TagNonCanonicalDecl, D->getSourceRange(), childIds, typeForDecl); typeEncoder.VisitQualTypeOf(typeForDecl, D); return true; } diff --git a/c2rust-transpile/src/c_ast/mod.rs b/c2rust-transpile/src/c_ast/mod.rs index b4ea64a607..97a2271f1d 100644 --- 
a/c2rust-transpile/src/c_ast/mod.rs
+++ b/c2rust-transpile/src/c_ast/mod.rs
@@ -299,6 +299,72 @@ impl TypedAstContext {
         }
     }
 
+    /// Construct a map from top-level decls in the main file to their source ranges.
+    ///
+    /// Each range begins where the previous non-nested top-level decl ended (or at line 1,
+    /// column 1 when the file changes), so it also spans whatever precedes the decl, such as
+    /// comments. Decls nested inside the previous one keep their own begin location.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any top-level decl lacks a begin or end location.
+    pub fn top_decl_locs(&self) -> IndexMap<CDeclId, (SrcLoc, SrcLoc)> {
+        let mut name_loc_map = IndexMap::new();
+        let mut prev_end_loc = SrcLoc {
+            fileid: 0,
+            line: 0,
+            column: 0,
+        };
+        // Sort decls by source location so we can reason about the possibly comment-containing gaps
+        // between them.
+        let mut decls_sorted = self.c_decls_top.clone();
+        decls_sorted.sort_by_key(|decl| self.c_decls[decl].begin_loc());
+        for decl_id in &decls_sorted {
+            let decl = &self.c_decls[decl_id];
+            let begin_loc: SrcLoc = decl.begin_loc().expect("no begin loc for top-level decl");
+            let end_loc: SrcLoc = decl.end_loc().expect("no end loc for top-level decl");
+
+            // Skip fileid 0; this is not a real file, so these source locations aren't important.
+            if begin_loc.fileid == 0 {
+                continue;
+            }
+            if begin_loc == end_loc {
+                log::warn!(
+                    "zero-length source range for top-level decl; skipping. source ranges for \
+                    top-level decls may be incorrect.\ndecl: {decl:?}"
+                );
+                continue;
+            }
+
+            // If encountering a new file, reset end of last top-level decl.
+            if prev_end_loc.fileid != begin_loc.fileid {
+                prev_end_loc = SrcLoc {
+                    fileid: begin_loc.fileid,
+                    line: 1,
+                    column: 1,
+                }
+            }
+
+            // This definition ends before the previous one does, i.e. it is nested.
+            // This does not generally occur for regular definitions, e.g. variables within
+            // functions, because the variables will not be top-level decls. But it can occur
+            // for macros defined inside functions, since all macros are top-level decls!
+            let is_nested = end_loc < prev_end_loc;
+            // End of the previous decl is the start of comments pertaining to the current one.
+            let new_begin_loc = if is_nested { begin_loc } else { prev_end_loc };
+
+            // Include only decls from the main file. Every id iterated here was cloned out of
+            // `c_decls_top`, so no separate top-level membership test is needed.
+            if self.get_source_path(decl) == Some(&self.main_file) {
+                name_loc_map.insert(*decl_id, (new_begin_loc, end_loc));
+            }
+            if !is_nested {
+                prev_end_loc = end_loc;
+            }
+        }
+        name_loc_map
+    }
+
     pub fn iter_decls(&self) -> indexmap::map::Iter<'_, CDeclId, CDecl> {
         self.c_decls.iter()
     }
diff --git a/c2rust-transpile/src/lib.rs b/c2rust-transpile/src/lib.rs
index 5a68bf21f6..19deb12cb7 100644
--- a/c2rust-transpile/src/lib.rs
+++ b/c2rust-transpile/src/lib.rs
@@ -72,6 +72,7 @@ pub struct TranspilerConfig {
     pub dump_structures: bool,
     pub verbose: bool,
     pub debug_ast_exporter: bool,
+    pub emit_c_decl_map: bool,
 
     // Options that control translation
     pub incremental_relooper: bool,
@@ -599,9 +600,31 @@ fn transpile_single(
     }
 
     // Perform the translation
-    let (translated_string, pragmas, crates) =
+    let (translated_string, maybe_decl_map, pragmas, crates) =
         translator::translate(typed_context, tcfg, input_path);
 
+    // When a C decl map was produced, write it as JSON next to the transpiled output.
+    if let Some(decl_map) = maybe_decl_map {
+        let decl_map_path = output_path.with_extension("c_decls.json");
+        let file = match File::create(&decl_map_path) {
+            Ok(file) => file,
+            Err(e) => panic!(
+                "Unable to open file {} for writing: {}",
+                decl_map_path.display(),
+                e
+            ),
+        };
+
+        match serde_json::ser::to_writer(file, &decl_map) {
+            Ok(()) => (),
+            Err(e) => panic!(
+                "Unable to write C declaration map to file {}: {}",
+                decl_map_path.display(),
+                e
+            ),
+        };
+    }
+
     let mut file = match File::create(&output_path) {
         Ok(file) => file,
         Err(e) => panic!(
diff --git a/c2rust-transpile/src/translator/mod.rs b/c2rust-transpile/src/translator/mod.rs
index 233664068e..4d3727bd91 100644
--- a/c2rust-transpile/src/translator/mod.rs
+++ b/c2rust-transpile/src/translator/mod.rs
@@ -9,6 +9,7 @@ use dtoa;
 use failure::{err_msg, format_err, Fail};
 use indexmap::indexmap;
 use indexmap::{IndexMap, IndexSet};
+use itertools::Itertools;
 use log::{error, info, trace, warn};
 use proc_macro2::{Punct, Spacing::*, Span,
TokenStream, TokenTree};
 use syn::spanned::Spanned as _;
@@ -454,11 +455,14 @@ pub fn translate_failure(tcfg: &TranspilerConfig, msg: &str) {
     }
 }
 
+/// Map from the name of each translated Rust item to the C source text of its declaration.
+type DeclMap = IndexMap<String, String>;
+
 pub fn translate(
     ast_context: TypedAstContext,
     tcfg: &TranspilerConfig,
     main_file: &Path,
-) -> (String, PragmaVec, CrateSet) {
+) -> (String, Option<DeclMap>, PragmaVec, CrateSet) {
     let mut t = Translation::new(ast_context, tcfg, main_file);
     let ctx = ExprContext {
         used: true,
@@ -474,6 +478,14 @@ pub fn translate(
     {
         t.locate_comments();
 
+        // Compute source ranges of top decls before pruning any, because pruned
+        // decls may help inform the ranges of kept ones.
+        let decl_source_ranges = if tcfg.emit_c_decl_map {
+            Some(t.ast_context.top_decl_locs())
+        } else {
+            None
+        };
+
         // Headers often pull in declarations that are unused;
         // we simplify the translator output by omitting those.
         t.ast_context
@@ -723,6 +735,97 @@ pub fn translate(
             })
             .collect::<Vec<_>>();
 
+        // Generate a map from Rust items to the source code of their C declarations.
+        let decl_map = decl_source_ranges.map(|decl_source_ranges| {
+            // Keyed by an insertion-ordered map so the emitted JSON follows decl order and is
+            // identical from run to run; `HashMap` iteration order is randomized per process.
+            let mut path_to_c_source_range: IndexMap<&Ident, (SrcLoc, SrcLoc)> = IndexMap::new();
+            for (decl, source_range) in decl_source_ranges {
+                match converted_decls.get(&decl) {
+                    Some(ConvertedDecl::ForeignItem(item)) => {
+                        path_to_c_source_range
+                            .insert(foreign_item_ident_vis(&*item).unwrap().0, source_range);
+                    }
+                    Some(ConvertedDecl::Item(item)) => {
+                        path_to_c_source_range.insert(item_ident(&*item).unwrap(), source_range);
+                    }
+                    Some(ConvertedDecl::Items(items)) => {
+                        for item in items {
+                            path_to_c_source_range
+                                .insert(item_ident(&*item).unwrap(), source_range);
+                        }
+                    }
+                    Some(ConvertedDecl::NoItem) => {}
+                    None => log::warn!(
+                        "no converted form to add to C decl map for top-level decl {decl:?}: {:?}!",
+                        t.ast_context.get_decl(&decl)
+                    ),
+                }
+            }
+
+            // Failing to locate or read the main file is fatal: there is nothing to slice.
+            let main_file_path = t
+                .ast_context
+                .get_file_path(t.main_file)
+                .expect("main file has no path recorded in the C AST context");
+            let file_content = std::fs::read(&main_file_path)
+                .unwrap_or_else(|e| panic!("Unable to read main C file for C decl map: {e}"));
+            // Byte offset of every `\n` in the file, in ascending order.
+            let line_end_offsets: Vec<usize> = file_content
+                .iter()
+                .positions(|c| *c == b'\n')
+                .collect();
+
+            /// Convert a source location line/column into a byte offset, given the positions of
+            /// each newline in the file.
+            fn src_loc_to_byte_offset(line_end_offsets: &[usize], loc: SrcLoc) -> usize {
+                let line_offset = loc
+                    .line
+                    .checked_sub(2) // lines are 1-indexed, and we want end of the previous line
+                    .and_then(|line| line_end_offsets.get(line as usize))
+                    .map(|x| x + 1) // increment end of the prev line to find start of this one
+                    .unwrap_or(0); // if we indexed out of bounds (e.g. for line 1), start at byte 0
+                line_offset + (loc.column as usize).saturating_sub(1)
+            }
+
+            // Slice into the source file, fixing up the ends to account for Clang AST quirks.
+            let slice_decl_with_fixups = |begin: SrcLoc, end: SrcLoc| -> &[u8] {
+                assert!(begin.line <= end.line, "{} <= {}", begin.line, end.line);
+                let mut begin_offset = src_loc_to_byte_offset(&line_end_offsets, begin);
+                let mut end_offset = src_loc_to_byte_offset(&line_end_offsets, end);
+                assert!(begin_offset <= end_offset);
+
+                // Skip whitespace and any trailing semicolons after the previous decl, but never
+                // run past the end of this decl (a range holding nothing else yields `b""`).
+                while begin_offset < end_offset
+                    && matches!(
+                        file_content.get(begin_offset),
+                        Some(b'\t' | b'\n' | b'\x0b' | b'\r' | b' ' | b';')
+                    )
+                {
+                    begin_offset += 1;
+                }
+
+                // Extend to include a single trailing semicolon if this decl is not a block
+                // (e.g., a variable declaration). `checked_sub` keeps offset 0 from underflowing.
+                let last_byte = end_offset.checked_sub(1).and_then(|i| file_content.get(i));
+                if last_byte != Some(&b'}') && file_content.get(end_offset) == Some(&b';') {
+                    end_offset += 1;
+                }
+
+                &file_content[begin_offset..end_offset]
+            };
+
+            path_to_c_source_range
+                .into_iter()
+                .map(|(ident, (begin, end))| {
+                    // Lossy conversion: C sources are not guaranteed to be valid UTF-8.
+                    let c_src = String::from_utf8_lossy(slice_decl_with_fixups(begin, end));
+                    (ident.to_string(), c_src.into_owned())
+                })
+                .collect::<DeclMap>()
+        });
+
         t.ast_context.sort_top_decls_for_emitting();
 
         for top_id in &t.ast_context.c_decls_top {
@@ -887,7 +990,7 @@ pub fn translate(
                 .copied()
                 .collect();
 
-        (translation, pragmas, crates)
+        (translation, decl_map, pragmas, crates)
     }
 }
 
diff --git a/c2rust-transpile/tests/snapshots.rs b/c2rust-transpile/tests/snapshots.rs
index 48a7f858b0..a811c8ea77 100644
--- a/c2rust-transpile/tests/snapshots.rs
+++ b/c2rust-transpile/tests/snapshots.rs
@@ -16,6 +16,7 @@ fn config() -> TranspilerConfig {
         dump_structures: false,
         verbose: false,
         debug_ast_exporter: false,
+        emit_c_decl_map: false,
         incremental_relooper: true,
         fail_on_multiple: false,
         filter: None,
diff --git a/c2rust/src/bin/c2rust-transpile.rs b/c2rust/src/bin/c2rust-transpile.rs
index 890664447b..e420613e89
100644 --- a/c2rust/src/bin/c2rust-transpile.rs +++ b/c2rust/src/bin/c2rust-transpile.rs @@ -36,6 +36,11 @@ struct Args { #[clap(long)] debug_ast_exporter: bool, + /// Write map of C decls corresponding to each translated Rust item + /// alongside the transpiled output with the extension `.c_decls.json`. + #[clap(long)] + emit_c_decl_map: bool, + /// Verbose mode #[clap(short = 'v', long)] verbose: bool, @@ -205,6 +210,7 @@ fn main() { dump_cfg_liveness: args.dump_cfgs_liveness, dump_structures: args.dump_structures, debug_ast_exporter: args.debug_ast_exporter, + emit_c_decl_map: args.emit_c_decl_map, verbose: args.verbose, incremental_relooper: !args.no_incremental_relooper,