1- use std:: str:: CharIndices ;
// CSV row splitting module. It supports:
// 1. double-quoted fields
// 2. commas inside a double-quoted field
// 3. double quotes in a field escaped by a preceding backslash (\")
// 4. double quotes in a field escaped by a preceding double quote (""), as discussed in
//    https://stackoverflow.com/questions/17808511/how-to-properly-escape-a-double-quote-in-csv
7+
// Example rows that are handled:
// v1,v2,v3
// "v1","v2","v3"
// "v1",v2,v3
// "Charles \"Pretty Boy\" Floyd","1 Short St, Smallville"
// "Charles ""Pretty Boy"" Floyd","1 Short St, Smallville"
14+
15+ use std:: { iter:: Peekable , str:: CharIndices } ;
216
317#[ derive( Debug ) ]
418pub struct CsvRow < ' a > {
@@ -8,14 +22,15 @@ pub struct CsvRow<'a> {
/// Iterator state for splitting one CSV row into borrowed field slices.
///
/// Fields prefixed with `field_` describe the field currently being scanned and are
/// reset whenever a field is yielded; fields prefixed with `cur_` describe the cursor
/// position inside that field.
#[derive(Debug)]
pub struct CsvRowSplit<'a> {
    /// The complete row; every yielded field is a sub-slice of it.
    row: &'a str,
    /// `(byte offset, char)` cursor over `row`, peekable so a quote can look one
    /// character ahead.
    char_indices: Peekable<CharIndices<'a>>,
    /// Field separator character.
    sep: char,
    /// Quote character.
    quote: char,
    /// Set once the final field has been yielded; `next` returns `None` afterwards.
    parse_done: bool,
    /// Byte offset in `row` at which the current field starts.
    field_start_index: usize,
    /// Whether the current field began with the quote character.
    field_is_quoted: bool,
    /// Whether a separator was seen inside the quoted part of the current field.
    field_has_separator: bool,
    /// Whether the cursor is currently inside an open quoted section.
    cur_in_quoted_field: bool,
    /// Whether the next character read is the first character of a field.
    cur_is_field_start: bool,
}
2035
2136impl < ' a > CsvRow < ' a > {
@@ -26,59 +41,77 @@ impl<'a> CsvRow<'a> {
2641 pub fn split ( self , sep : char , quote : char ) -> CsvRowSplit < ' a > {
2742 CsvRowSplit {
2843 row : self . row ,
29- char_indices : self . row . char_indices ( ) ,
44+ char_indices : self . row . char_indices ( ) . peekable ( ) ,
3045 sep,
3146 quote,
32- done : false ,
33- field_start : 0 ,
34- field_end_shift : 0 ,
35- in_quoted_field : false ,
36- has_separator : false , // whether a field has a CSV sep in it
47+ parse_done : false ,
48+ field_start_index : 0 ,
49+ field_is_quoted : false ,
50+ field_has_separator : false , // whether a field has a CSV sep within it
51+ cur_in_quoted_field : false ,
52+ cur_is_field_start : true , // whether current position is the start of a field
3753 }
3854 }
3955}
4056
57+ impl < ' a > CsvRowSplit < ' a > { }
58+
4159impl < ' a > Iterator for CsvRowSplit < ' a > {
4260 type Item = & ' a str ;
4361
4462 fn next ( & mut self ) -> Option < Self :: Item > {
45- if self . done {
63+ if self . parse_done {
4664 return None ;
4765 }
4866
4967 loop {
50- if let Some ( ( index, c) ) = self . char_indices . next ( ) {
51- if c == '\\' {
52- self . char_indices . next ( ) ;
53- } else if c == self . sep {
54- if self . in_quoted_field {
55- self . has_separator = true ;
68+ let Some ( ( index, c) ) = self . char_indices . next ( ) else {
69+ // obtain last field
70+ self . parse_done = true ;
71+ let field_shift = self . field_is_quoted as usize - self . field_has_separator as usize ;
72+ let i = self . field_start_index + field_shift;
73+ let j = self . row . len ( ) - field_shift;
74+ let f = unsafe { self . row . get_unchecked ( i..j) } ;
75+ return Some ( f) ;
76+ } ;
77+
78+ if c == '\\' {
79+ // skip \ escape, e.g., v1,v2\",v3 is parsed into ["v1", "v2\"", "v3"]
80+ self . char_indices . next ( ) ;
81+ } else if c == self . sep {
82+ if self . cur_in_quoted_field {
83+ self . field_has_separator = true ;
84+ } else {
85+ let field_shift =
86+ self . field_is_quoted as usize - self . field_has_separator as usize ;
87+ let i = self . field_start_index + field_shift;
88+ let j = index - field_shift;
89+ let f = unsafe { self . row . get_unchecked ( i..j) } ;
90+
91+ self . field_start_index = index + 1 ;
92+ self . field_is_quoted = false ;
93+ self . field_has_separator = false ;
94+ self . cur_in_quoted_field = false ;
95+ self . cur_is_field_start = true ;
96+
97+ return Some ( f) ;
98+ }
99+ } else if c == self . quote {
100+ if self . cur_is_field_start {
101+ self . field_is_quoted = true ;
102+ self . cur_in_quoted_field = true ;
103+ } else {
104+ let next_char = self . char_indices . peek ( ) ;
105+ if next_char. is_none ( ) || next_char. is_some_and ( |( _, v) | v == & self . sep ) {
106+ self . cur_in_quoted_field = false ;
56107 } else {
57- let has_sep = self . has_separator as usize ;
58- let i = self . field_start - has_sep;
59- let j = index - self . field_end_shift + has_sep;
60- let f = unsafe { self . row . get_unchecked ( i..j) } ;
61-
62- self . field_start = index + 1 ;
63- self . field_end_shift = 0 ;
64- self . has_separator = false ;
65- return Some ( f) ;
108+ // skip double-quotes escape, e.g., v1,v2"",v3 is parsed into ["v1", "v2""", "v3"]
109+ self . char_indices . next ( ) ;
66110 }
67- } else if c == self . quote {
68- let i = self . in_quoted_field as usize ;
69- self . field_start += i;
70- self . field_end_shift += i;
71-
72- self . in_quoted_field = !self . in_quoted_field ;
73111 }
74- } else {
75- self . done = true ;
76- let has_sep = self . has_separator as usize ;
77- let i = self . field_start - has_sep;
78- let j = self . row . len ( ) - self . field_end_shift + has_sep;
79- let f = unsafe { self . row . get_unchecked ( i..j) } ;
80- return Some ( f) ;
81112 }
113+
114+ self . cur_is_field_start = false ;
82115 }
83116 }
84117}
@@ -90,33 +123,57 @@ mod tests {
90123
/// Exercises `CsvRow::split` with `,` as separator and `"` as quote on plain,
/// quoted, and escaped fields.
#[test]
fn test_csv_row_split() {
    // multi-byte characters in an unquoted field
    let r = "我们abc,def,12";
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["我们abc", "def", "12"]);

    // trailing separator yields a trailing empty field
    let r = "1,2,3,";
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["1", "2", "3", ""]);

    // empty quoted field
    let r = r#"1,2,3,"""#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["1", "2", "3", ""]);

    let r = r#"1,2,3,"",4"#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["1", "2", "3", "", "4"]);

    let r = r#"1,2,3,"","4""#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["1", "2", "3", "", "4"]);

    // quoted field
    let r = r#""1",2,3,"#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["1", "2", "3", ""]);

    // comma in quoted field: the field keeps its surrounding quotes
    let r = r#"first,second,"third,fourth",fifth"#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["first", "second", r#""third,fourth""#, "fifth"]);

    let r = r#"first,second,"third,fourth","fifth""#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec!["first", "second", r#""third,fourth""#, "fifth"]);

    let r = r#""third,fourth","fifth""#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec![r#""third,fourth""#, "fifth"]);

    // double quote in a field, escaped by a preceding backslash
    let r = r#"third\",fourth,"fifth""#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(o, vec![r#"third\""#, "fourth", "fifth"]);

    // double quote in a field, escaped by a preceding double quote
    let r = r#""Charles ""Pretty Boy"" Floyd","1 Short St, Smallville""#;
    let o = CsvRow::new(&r).split(',', '"').collect::<Vec<_>>();
    assert_eq!(
        o,
        vec![
            r#"Charles ""Pretty Boy"" Floyd"#,
            r#""1 Short St, Smallville""#
        ]
    );
}
122179}
0 commit comments