55This module provides a CSV parser and writer.
66"""
77
8- from typing import (
9- Any ,
10- Dict ,
11- Iterable ,
12- List ,
13- Optional ,
14- Union ,
15- TextIO ,
16- Sequence ,
17- Type ,
18- )
8+ from typing import Any , Dict , Iterable , List , Optional , Sequence , TextIO , Union
199
2010# Quoting styles
2111QUOTE_MINIMAL = 0
@@ -337,20 +327,30 @@ def sniff(
337327 # For now, use a heuristic: consistent number of fields
338328 first_line_fields = - 1
339329 line_consistency = 0
330+ total_delim_count = 0
340331 for i , line in enumerate (
341332 lines [:10 ]
342333 ): # Check consistency over more lines
343334 # A very simple split, doesn't respect quoting for now for sniffing delimiter
344335 fields = line .split (delim_char )
336+ total_delim_count += line .count (delim_char )
345337 if i == 0 :
346338 first_line_fields = len (fields )
347- if first_line_fields > 0 :
339+ if (
340+ first_line_fields > 1
341+ ): # Need at least 2 fields to be meaningful
348342 line_consistency += 1
349343 elif len (fields ) == first_line_fields :
350344 line_consistency += 1
351345
352- if first_line_fields > 0 and line_consistency > max_consistency :
353- max_consistency = line_consistency
346+ # Score based on consistency and delimiter frequency
347+ score = line_consistency * 10 + total_delim_count
348+ if (
349+ first_line_fields > 1
350+ and score > max_consistency
351+ and total_delim_count > 0
352+ ):
353+ max_consistency = score
354354 best_dialect_params = potential_dialect_params
355355 best_dialect_params .setdefault ("quotechar" , '"' ) # Ensure a default
356356 best_dialect_params .setdefault ("doublequote" , True )
@@ -367,7 +367,7 @@ def sniff(
367367 except Exception : # Broad exception if parsing attempt fails
368368 continue
369369
370- if not best_dialect_params :
370+ if not best_dialect_params or max_consistency <= 0 :
371371 raise Error ("Could not determine delimiter" )
372372
373373 # Create a Dialect instance. Sniffer in CPython returns a dialect *class*,
@@ -470,6 +470,7 @@ def reader(
470470 quotechar = d .quotechar
471471 quoting = d .quoting
472472 skipinitialspace = d .skipinitialspace
473+ lineterminator = d .lineterminator
473474 # strict = d.strict # TODO: Use strict mode
474475
475476 if not csvfile :
@@ -488,8 +489,8 @@ def reader(
488489 raise Error (f"field larger than field limit ({ _field_size_limit } )" )
489490
490491 row_str = row_str_orig .rstrip (
491- " \r \n "
492- ) # Reader should not depend on lineterminator from dialect
492+ lineterminator
493+ ) # Reader should use dialect's lineterminator
493494
494495 fields : List [str ] = []
495496 current_field : str = ""
@@ -579,9 +580,7 @@ def reader(
579580 pass
580581 else :
581582 if d .strict :
582- raise Error (
583- f"'{ delimiter } ' expected after '{ quotechar } ' at char { idx } , found '{ char } '"
584- )
583+ raise Error (f"delimiter expected after '{ quotechar } '" )
585584 # If not strict, CPython CSV often appends this char to the field or starts a new unquoted field.
586585 # This behavior is complex. For simplicity, we'll be strict or error-prone here.
587586 # Let's assume for now it's an error if strict, or append to field if not (though might be wrong for some cases)
@@ -604,7 +603,7 @@ def reader(
604603 if d .strict or not (
605604 escapechar and row_str .endswith (escapechar )
606605 ): # CPython behavior for unclosed quote
607- raise Error ("unexpected end of data - unclosed quote" )
606+ raise Error ("unclosed quote" )
608607 if state == ESCAPE :
609608 raise Error ("unexpected end of data - incomplete escape sequence" )
610609
@@ -670,7 +669,10 @@ def writerow(self, row: _Row) -> None:
670669 elif quoting == QUOTE_NONNUMERIC :
671670 if quotechar is None :
672671 raise Error ("quotechar must be set for QUOTE_NONNUMERIC" )
673- if not isinstance (field_obj , (int , float )):
672+ # Check for boolean first since isinstance(bool, int) is True
673+ if isinstance (field_obj , bool ) or not isinstance (
674+ field_obj , (int , float )
675+ ):
674676 needs_quoting = True
675677 else :
676678 if quotechar and (
@@ -702,20 +704,21 @@ def writerow(self, row: _Row) -> None:
702704 continue
703705
704706 if needs_quoting and quotechar :
705- escaped_field = ""
707+ escaped_field = field_str # Start with the original field
706708 if doublequote :
707- escaped_field = field_str .replace (quotechar , quotechar * 2 )
709+ escaped_field = escaped_field .replace (quotechar , quotechar * 2 )
708710 elif escapechar :
709- escaped_field = field_str .replace (escapechar , escapechar * 2 )
711+ escaped_field = escaped_field .replace (escapechar , escapechar * 2 )
710712 escaped_field = escaped_field .replace (
711713 quotechar , escapechar + quotechar
712714 )
713715 else :
714716 # This case means quotechar is in field, needs_quoting is true,
715717 # but no mechanism (doublequote=F, escapechar=None) to escape it.
716- raise Error (
717- "quotechar found in field, but no escape mechanism (doublequote=False, escapechar=None)"
718- )
718+ if quotechar in field_str :
719+ raise Error (
720+ "quotechar found in field, but no escape mechanism (doublequote=False, escapechar=None)"
721+ )
719722
720723 processed_fields .append (quotechar + escaped_field + quotechar )
721724 else :
0 commit comments