Skip to content

Commit 3165a63

Browse files
committed
support parsing double-quote escaped by a preceding double-quote
1 parent e7cb357 commit 3165a63

1 file changed

Lines changed: 111 additions & 54 deletions

File tree

src/utils/row_split.rs

Lines changed: 111 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,18 @@
1-
use std::str::CharIndices;
1+
// CSV row split module that supports:
2+
// 1. double-quoted field
3+
// 2. comma in a double-quoted field
4+
// 3. double-quotes in a field escaped by a backslash \
5+
// 4. double-quotes in a field escaped by a preceding double-quote, as discussed in
6+
// https://stackoverflow.com/questions/17808511/how-to-properly-escape-a-double-quote-in-csv
7+
8+
// works for the following examples:
9+
// v1,v2,v3
10+
// "v1","v2","v3"
11+
// "v1",v2,v3
12+
// "Charles \"Pretty Boy\" Floyd","1 Short St, Smallville"
13+
// "Charles ""Pretty Boy"" Floyd","1 Short St, Smallville"
14+
15+
use std::{iter::Peekable, str::CharIndices};
216

317
#[derive(Debug)]
418
pub struct CsvRow<'a> {
@@ -8,14 +22,15 @@ pub struct CsvRow<'a> {
822
#[derive(Debug)]
923
pub struct CsvRowSplit<'a> {
1024
row: &'a str,
11-
char_indices: CharIndices<'a>,
25+
char_indices: Peekable<CharIndices<'a>>,
1226
sep: char,
1327
quote: char,
14-
done: bool,
15-
field_start: usize,
16-
field_end_shift: usize,
17-
in_quoted_field: bool,
18-
has_separator: bool,
28+
parse_done: bool,
29+
field_start_index: usize,
30+
field_is_quoted: bool,
31+
field_has_separator: bool,
32+
cur_in_quoted_field: bool,
33+
cur_is_field_start: bool,
1934
}
2035

2136
impl<'a> CsvRow<'a> {
@@ -26,59 +41,77 @@ impl<'a> CsvRow<'a> {
2641
pub fn split(self, sep: char, quote: char) -> CsvRowSplit<'a> {
2742
CsvRowSplit {
2843
row: self.row,
29-
char_indices: self.row.char_indices(),
44+
char_indices: self.row.char_indices().peekable(),
3045
sep,
3146
quote,
32-
done: false,
33-
field_start: 0,
34-
field_end_shift: 0,
35-
in_quoted_field: false,
36-
has_separator: false, // whether a field has a CSV sep in it
47+
parse_done: false,
48+
field_start_index: 0,
49+
field_is_quoted: false,
50+
field_has_separator: false, // whether a field has a CSV sep within it
51+
cur_in_quoted_field: false,
52+
cur_is_field_start: true, // whether current position is the start of a field
3753
}
3854
}
3955
}
4056

57+
impl<'a> CsvRowSplit<'a> {}
58+
4159
impl<'a> Iterator for CsvRowSplit<'a> {
4260
type Item = &'a str;
4361

4462
fn next(&mut self) -> Option<Self::Item> {
45-
if self.done {
63+
if self.parse_done {
4664
return None;
4765
}
4866

4967
loop {
50-
if let Some((index, c)) = self.char_indices.next() {
51-
if c == '\\' {
52-
self.char_indices.next();
53-
} else if c == self.sep {
54-
if self.in_quoted_field {
55-
self.has_separator = true;
68+
let Some((index, c)) = self.char_indices.next() else {
69+
// obtain last field
70+
self.parse_done = true;
71+
let field_shift = self.field_is_quoted as usize - self.field_has_separator as usize;
72+
let i = self.field_start_index + field_shift;
73+
let j = self.row.len() - field_shift;
74+
let f = unsafe { self.row.get_unchecked(i..j) };
75+
return Some(f);
76+
};
77+
78+
if c == '\\' {
79+
// skip \ escape, e.g., v1,v2\",v3 is parsed into ["v1", "v2\"", "v3"]
80+
self.char_indices.next();
81+
} else if c == self.sep {
82+
if self.cur_in_quoted_field {
83+
self.field_has_separator = true;
84+
} else {
85+
let field_shift =
86+
self.field_is_quoted as usize - self.field_has_separator as usize;
87+
let i = self.field_start_index + field_shift;
88+
let j = index - field_shift;
89+
let f = unsafe { self.row.get_unchecked(i..j) };
90+
91+
self.field_start_index = index + 1;
92+
self.field_is_quoted = false;
93+
self.field_has_separator = false;
94+
self.cur_in_quoted_field = false;
95+
self.cur_is_field_start = true;
96+
97+
return Some(f);
98+
}
99+
} else if c == self.quote {
100+
if self.cur_is_field_start {
101+
self.field_is_quoted = true;
102+
self.cur_in_quoted_field = true;
103+
} else {
104+
let next_char = self.char_indices.peek();
105+
if next_char.is_none() || next_char.is_some_and(|(_, v)| v == &self.sep) {
106+
self.cur_in_quoted_field = false;
56107
} else {
57-
let has_sep = self.has_separator as usize;
58-
let i = self.field_start - has_sep;
59-
let j = index - self.field_end_shift + has_sep;
60-
let f = unsafe { self.row.get_unchecked(i..j) };
61-
62-
self.field_start = index + 1;
63-
self.field_end_shift = 0;
64-
self.has_separator = false;
65-
return Some(f);
108+
// skip double-quotes escape, e.g., v1,v2"",v3 is parsed into ["v1", "v2""", "v3"]
109+
self.char_indices.next();
66110
}
67-
} else if c == self.quote {
68-
let i = self.in_quoted_field as usize;
69-
self.field_start += i;
70-
self.field_end_shift += i;
71-
72-
self.in_quoted_field = !self.in_quoted_field;
73111
}
74-
} else {
75-
self.done = true;
76-
let has_sep = self.has_separator as usize;
77-
let i = self.field_start - has_sep;
78-
let j = self.row.len() - self.field_end_shift + has_sep;
79-
let f = unsafe { self.row.get_unchecked(i..j) };
80-
return Some(f);
81112
}
113+
114+
self.cur_is_field_start = false;
82115
}
83116
}
84117
}
@@ -90,33 +123,57 @@ mod tests {
90123

91124
#[test]
92125
fn test_csv_row_split() {
126+
let r = "我们abc,def,12";
127+
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
128+
assert_eq!(o, vec!["我们abc", "def", "12"]);
129+
93130
let r = "1,2,3,";
94131
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
95132
assert_eq!(o, vec!["1", "2", "3", ""]);
96133

97-
let r = "\"1\",2,3,";
134+
let r = r#"1,2,3,"""#;
98135
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
99136
assert_eq!(o, vec!["1", "2", "3", ""]);
100137

101-
let r = "first,second,\"third,fourth\",fifth";
138+
let r = r#"1,2,3,"",4"#;
102139
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
103-
assert_eq!(o, vec!["first", "second", "\"third,fourth\"", "fifth"]);
140+
assert_eq!(o, vec!["1", "2", "3", "", "4"]);
104141

105-
let r = "first,second,\"third,fourth\",\"fifth\"";
142+
let r = r#"1,2,3,"","4""#;
106143
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
107-
assert_eq!(o, vec!["first", "second", "\"third,fourth\"", "fifth"]);
144+
assert_eq!(o, vec!["1", "2", "3", "", "4"]);
108145

109-
let r = "\"third,fourth\",\"fifth\"";
146+
// quoted field
147+
let r = r#""1",2,3,"#;
110148
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
111-
assert_eq!(o, vec!["\"third,fourth\"", "fifth"]);
149+
assert_eq!(o, vec!["1", "2", "3", ""]);
112150

113-
let r = "我们abc,def,12";
151+
// comma in quoted field
152+
let r = r#"first,second,"third,fourth",fifth"#;
114153
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
115-
assert_eq!(o, vec!["我们abc", "def", "12"]);
154+
assert_eq!(o, vec!["first", "second", r#""third,fourth""#, "fifth"]);
155+
156+
let r = r#"first,second,"third,fourth","fifth""#;
157+
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
158+
assert_eq!(o, vec!["first", "second", r#""third,fourth""#, "fifth"]);
159+
160+
let r = r#""third,fourth","fifth""#;
161+
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
162+
assert_eq!(o, vec![r#""third,fourth""#, "fifth"]);
163+
164+
// double-quote in field, escaped by a preceding \
165+
let r = r#"third\",fourth,"fifth""#;
166+
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
167+
assert_eq!(o, vec![r#"third\""#, "fourth", "fifth"]);
116168

117-
// double-quote in field
118-
let r = "\"third\\\",fourth\",\"fifth\"";
169+
let r = r#""Charles ""Pretty Boy"" Floyd","1 Short St, Smallville""#;
119170
let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
120-
assert_eq!(o, vec!["\"third\\\",fourth\"", "fifth"]);
171+
assert_eq!(
172+
o,
173+
vec![
174+
r#"Charles ""Pretty Boy"" Floyd"#,
175+
r#""1 Short St, Smallville""#
176+
]
177+
);
121178
}
122179
}

0 commit comments

Comments
 (0)