Skip to content

Commit bf38bb1

Browse files
committed
new lint: chars_enumerate_for_byte_indices
1 parent 6a281e9 commit bf38bb1

7 files changed

+426
-0
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -5352,6 +5352,7 @@ Released 2018-09-13
53525352
[`cast_slice_from_raw_parts`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_from_raw_parts
53535353
[`cfg_not_test`]: https://rust-lang.github.io/rust-clippy/master/index.html#cfg_not_test
53545354
[`char_lit_as_u8`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_lit_as_u8
5355+
[`chars_enumerate_for_byte_indices`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_enumerate_for_byte_indices
53555356
[`chars_last_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_last_cmp
53565357
[`chars_next_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_next_cmp
53575358
[`checked_conversions`]: https://rust-lang.github.io/rust-clippy/master/index.html#checked_conversions

clippy_lints/src/declared_lints.rs

+1
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ pub static LINTS: &[&crate::LintInfo] = &[
273273
crate::literal_representation::MISTYPED_LITERAL_SUFFIXES_INFO,
274274
crate::literal_representation::UNREADABLE_LITERAL_INFO,
275275
crate::literal_representation::UNUSUAL_BYTE_GROUPINGS_INFO,
276+
crate::loops::CHARS_ENUMERATE_FOR_BYTE_INDICES_INFO,
276277
crate::loops::EMPTY_LOOP_INFO,
277278
crate::loops::EXPLICIT_COUNTER_LOOP_INFO,
278279
crate::loops::EXPLICIT_INTO_ITER_LOOP_INFO,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
use std::ops::ControlFlow;
2+
3+
use clippy_utils::diagnostics::span_lint_hir_and_then;
4+
use clippy_utils::ty::is_type_lang_item;
5+
use clippy_utils::visitors::for_each_expr;
6+
use clippy_utils::{eq_expr_value, higher, path_to_local_id};
7+
use rustc_errors::{Applicability, MultiSpan};
8+
use rustc_hir::{Expr, ExprKind, LangItem, Node, Pat, PatKind};
9+
use rustc_lint::LateContext;
10+
use rustc_middle::ty::Ty;
11+
use rustc_span::{Span, sym};
12+
13+
use super::CHARS_ENUMERATE_FOR_BYTE_INDICES;
14+
15+
// The list of `str` methods we want to lint that have a `usize` argument representing a byte index.
16+
// Note: `String` also has methods that work with byte indices,
17+
// but they all take `&mut self` and aren't worth considering since the user couldn't have called
18+
// them while the chars iterator is live anyway.
19+
const BYTE_INDEX_METHODS: &[&str] = &[
20+
"is_char_boundary",
21+
"floor_char_boundary",
22+
"ceil_char_boundary",
23+
"get",
24+
"index",
25+
"index_mut",
26+
"get_mut",
27+
"get_unchecked",
28+
"get_unchecked_mut",
29+
"slice_unchecked",
30+
"slice_mut_unchecked",
31+
"split_at",
32+
"split_at_mut",
33+
"split_at_checked",
34+
"split_at_mut_checked",
35+
];
36+
37+
const CONTINUE: ControlFlow<!, ()> = ControlFlow::Continue(());
38+
39+
pub(super) fn check<'tcx>(cx: &LateContext<'tcx>, pat: &Pat<'_>, iterable: &Expr<'_>, body: &'tcx Expr<'tcx>) {
40+
if let ExprKind::MethodCall(_, enumerate_recv, _, enumerate_span) = iterable.kind
41+
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(iterable.hir_id)
42+
&& cx.tcx.is_diagnostic_item(sym::enumerate_method, method_id)
43+
&& let ExprKind::MethodCall(_, chars_recv, _, chars_span) = enumerate_recv.kind
44+
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(enumerate_recv.hir_id)
45+
&& cx.tcx.is_diagnostic_item(sym::str_chars, method_id)
46+
{
47+
if let PatKind::Tuple([pat, _], _) = pat.kind
48+
&& let PatKind::Binding(_, binding_id, ..) = pat.kind
49+
{
50+
// Destructured iterator element `(idx, _)`, look for uses of the binding
51+
for_each_expr(cx, body, |expr| {
52+
if path_to_local_id(expr, binding_id) {
53+
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
54+
}
55+
CONTINUE
56+
});
57+
} else if let PatKind::Binding(_, binding_id, ..) = pat.kind {
58+
// Bound as a tuple, look for `tup.0`
59+
for_each_expr(cx, body, |expr| {
60+
if let ExprKind::Field(e, field) = expr.kind
61+
&& path_to_local_id(e, binding_id)
62+
&& field.name == sym::integer(0)
63+
{
64+
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
65+
}
66+
CONTINUE
67+
});
68+
}
69+
}
70+
}
71+
72+
fn check_index_usage<'tcx>(
73+
cx: &LateContext<'tcx>,
74+
expr: &'tcx Expr<'tcx>,
75+
pat: &Pat<'_>,
76+
enumerate_span: Span,
77+
chars_span: Span,
78+
chars_recv: &Expr<'_>,
79+
) {
80+
let Some(parent_expr) = index_consumed_at(cx, expr) else {
81+
return;
82+
};
83+
84+
let is_string_like = |ty: Ty<'_>| ty.is_str() || is_type_lang_item(cx, ty, LangItem::String);
85+
let message = match parent_expr.kind {
86+
ExprKind::MethodCall(segment, recv, ..)
87+
if cx.typeck_results().expr_ty_adjusted(recv).peel_refs().is_str()
88+
&& BYTE_INDEX_METHODS.contains(&segment.ident.name.as_str())
89+
&& eq_expr_value(cx, chars_recv, recv) =>
90+
{
91+
"passing a character position to a method that expects a byte index"
92+
},
93+
ExprKind::Index(target, ..)
94+
if is_string_like(cx.typeck_results().expr_ty_adjusted(target).peel_refs())
95+
&& eq_expr_value(cx, chars_recv, target) =>
96+
{
97+
"indexing into a string with a character position where a byte index is expected"
98+
},
99+
_ => return,
100+
};
101+
102+
span_lint_hir_and_then(
103+
cx,
104+
CHARS_ENUMERATE_FOR_BYTE_INDICES,
105+
expr.hir_id,
106+
expr.span,
107+
message,
108+
|diag| {
109+
diag.note("a character can take up more than one byte, so they are not interchangeable")
110+
.span_note(
111+
MultiSpan::from_spans(vec![pat.span, enumerate_span]),
112+
"position comes from the enumerate iterator",
113+
)
114+
.span_suggestion_verbose(
115+
chars_span.to(enumerate_span),
116+
"consider using `.char_indices()` instead",
117+
"char_indices()",
118+
Applicability::MaybeIncorrect,
119+
);
120+
},
121+
);
122+
}
123+
124+
/// Returns the expression which ultimately consumes the index.
125+
/// This is usually the parent expression, i.e. `.split_at(idx)` for `idx`,
126+
/// but for `.get(..idx)` we want to consider the method call the consuming expression,
127+
/// which requires skipping past the range expression.
128+
fn index_consumed_at<'tcx>(cx: &LateContext<'tcx>, expr: &'tcx Expr<'tcx>) -> Option<&'tcx Expr<'tcx>> {
129+
for (_, node) in cx.tcx.hir().parent_iter(expr.hir_id) {
130+
match node {
131+
Node::Expr(expr) if higher::Range::hir(expr).is_some() => {},
132+
Node::ExprField(_) => {},
133+
Node::Expr(expr) => return Some(expr),
134+
_ => break,
135+
}
136+
}
137+
None
138+
}

clippy_lints/src/loops/mod.rs

+45
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod chars_enumerate_for_byte_indices;
12
mod empty_loop;
23
mod explicit_counter_loop;
34
mod explicit_into_iter_loop;
@@ -714,6 +715,48 @@ declare_clippy_lint! {
714715
"possibly unintended infinite loop"
715716
}
716717

718+
declare_clippy_lint! {
719+
/// ### What it does
720+
/// Checks for usage of a character position yielded by `.chars().enumerate()` in a context where a **byte index** is expected,
721+
/// such as an argument to a specific `str` method or indexing into a `str` or `String`.
722+
///
723+
/// ### Why is this bad?
724+
/// A character (more specifically, a Unicode scalar value) that is yielded by `str::chars` can take up multiple bytes,
725+
/// so a character position does not necessarily have the same byte index at which the character is stored.
726+
/// Thus, using the character position where a byte index is expected can unexpectedly return wrong values
727+
/// or panic when the string consists of multibyte characters.
728+
///
729+
/// For example, the character `a` in `äa` is stored at byte index 2 but has the character position 1.
730+
/// Using the character position 1 to index into the string will lead to a panic as it is in the middle of the first character.
731+
///
732+
/// Instead of `.chars().enumerate()`, the correct iterator to use is `.char_indices()`, which yields byte indices.
733+
///
734+
/// This pattern is technically fine if the strings are known to only use the ASCII subset,
735+
/// but there is also no downside to just using `.char_indices()` directly.
736+
///
737+
/// You may also want to read the [chapter on strings in the Rust Book](https://doc.rust-lang.org/book/ch08-02-strings.html)
738+
/// which goes into this in more detail.
739+
///
740+
/// ### Example
741+
/// ```no_run
742+
/// # let s = "...";
743+
/// for (idx, c) in s.chars().enumerate() {
744+
/// let _ = s[idx..]; // ⚠️ Panics for strings consisting of multibyte characters
745+
/// }
746+
/// ```
747+
/// Use instead:
748+
/// ```no_run
749+
/// # let s = "...";
750+
/// for (idx, c) in s.char_indices() {
751+
/// let _ = s[idx..];
752+
/// }
753+
/// ```
754+
#[clippy::version = "1.83.0"]
755+
pub CHARS_ENUMERATE_FOR_BYTE_INDICES,
756+
correctness,
757+
"using the character position yielded by `.chars().enumerate()` in a context where a byte index is expected"
758+
}
759+
717760
pub struct Loops {
718761
msrv: Msrv,
719762
enforce_iter_loop_reborrow: bool,
@@ -750,6 +793,7 @@ impl_lint_pass!(Loops => [
750793
MANUAL_WHILE_LET_SOME,
751794
UNUSED_ENUMERATE_INDEX,
752795
INFINITE_LOOP,
796+
CHARS_ENUMERATE_FOR_BYTE_INDICES,
753797
]);
754798

755799
impl<'tcx> LateLintPass<'tcx> for Loops {
@@ -834,6 +878,7 @@ impl Loops {
834878
manual_flatten::check(cx, pat, arg, body, span);
835879
manual_find::check(cx, pat, arg, body, span, expr);
836880
unused_enumerate_index::check(cx, pat, arg, body);
881+
chars_enumerate_for_byte_indices::check(cx, pat, arg, body);
837882
}
838883

839884
fn check_for_loop_arg(&self, cx: &LateContext<'_>, _: &Pat<'_>, arg: &Expr<'_>) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#![feature(round_char_boundary)]
2+
#![warn(clippy::chars_enumerate_for_byte_indices)]
3+
4+
trait StrExt {
5+
fn use_index(&self, _: usize);
6+
}
7+
impl StrExt for str {
8+
fn use_index(&self, _: usize) {}
9+
}
10+
11+
fn bad(prim: &str, string: String) {
12+
for (idx, _) in prim.char_indices() {
13+
let _ = prim[..idx];
14+
//~^ chars_enumerate_for_byte_indices
15+
prim.split_at(idx);
16+
//~^ chars_enumerate_for_byte_indices
17+
18+
// This won't panic, but it can still return a wrong substring
19+
let _ = prim[..prim.floor_char_boundary(idx)];
20+
//~^ chars_enumerate_for_byte_indices
21+
22+
// can't use #[expect] here because the .fixed file will still have the attribute and create an
23+
// unfulfilled expectation, but make sure lint level attributes work on the use expression:
24+
#[allow(clippy::chars_enumerate_for_byte_indices)]
25+
let _ = prim[..idx];
26+
}
27+
28+
for c in prim.char_indices() {
29+
let _ = prim[..c.0];
30+
//~^ chars_enumerate_for_byte_indices
31+
prim.split_at(c.0);
32+
//~^ chars_enumerate_for_byte_indices
33+
}
34+
35+
for (idx, _) in string.char_indices() {
36+
let _ = string[..idx];
37+
//~^ chars_enumerate_for_byte_indices
38+
string.split_at(idx);
39+
//~^ chars_enumerate_for_byte_indices
40+
}
41+
}
42+
43+
fn good(prim: &str, prim2: &str) {
44+
for (idx, _) in prim.chars().enumerate() {
45+
// Indexing into a different string
46+
let _ = prim2[..idx];
47+
48+
// Unknown use
49+
std::hint::black_box(idx);
50+
51+
// Method call to user defined extension trait
52+
prim.use_index(idx);
53+
54+
// str method taking a usize that doesn't represent a byte index
55+
prim.splitn(idx, prim2);
56+
}
57+
}
58+
59+
fn main() {}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#![feature(round_char_boundary)]
2+
#![warn(clippy::chars_enumerate_for_byte_indices)]
3+
4+
trait StrExt {
5+
fn use_index(&self, _: usize);
6+
}
7+
impl StrExt for str {
8+
fn use_index(&self, _: usize) {}
9+
}
10+
11+
fn bad(prim: &str, string: String) {
12+
for (idx, _) in prim.chars().enumerate() {
13+
let _ = prim[..idx];
14+
//~^ chars_enumerate_for_byte_indices
15+
prim.split_at(idx);
16+
//~^ chars_enumerate_for_byte_indices
17+
18+
// This won't panic, but it can still return a wrong substring
19+
let _ = prim[..prim.floor_char_boundary(idx)];
20+
//~^ chars_enumerate_for_byte_indices
21+
22+
// can't use #[expect] here because the .fixed file will still have the attribute and create an
23+
// unfulfilled expectation, but make sure lint level attributes work on the use expression:
24+
#[allow(clippy::chars_enumerate_for_byte_indices)]
25+
let _ = prim[..idx];
26+
}
27+
28+
for c in prim.chars().enumerate() {
29+
let _ = prim[..c.0];
30+
//~^ chars_enumerate_for_byte_indices
31+
prim.split_at(c.0);
32+
//~^ chars_enumerate_for_byte_indices
33+
}
34+
35+
for (idx, _) in string.chars().enumerate() {
36+
let _ = string[..idx];
37+
//~^ chars_enumerate_for_byte_indices
38+
string.split_at(idx);
39+
//~^ chars_enumerate_for_byte_indices
40+
}
41+
}
42+
43+
fn good(prim: &str, prim2: &str) {
44+
for (idx, _) in prim.chars().enumerate() {
45+
// Indexing into a different string
46+
let _ = prim2[..idx];
47+
48+
// Unknown use
49+
std::hint::black_box(idx);
50+
51+
// Method call to user defined extension trait
52+
prim.use_index(idx);
53+
54+
// str method taking a usize that doesn't represent a byte index
55+
prim.splitn(idx, prim2);
56+
}
57+
}
58+
59+
fn main() {}

0 commit comments

Comments
 (0)