Skip to content

Commit

Permalink
fix: csvformat supports --out-quoting 2. --quoting (and --out-quoting…
Browse files Browse the repository at this point in the history
…) support options from Python 3.12.
  • Loading branch information
jpmckinney committed Apr 28, 2024
1 parent b3f68a3 commit 95dc26d
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 43 deletions.
12 changes: 9 additions & 3 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
2.0.0 - Unreleased
------------------

**BACKWARDS-INCOMPATIBLE CHANGES**
**BACKWARDS-INCOMPATIBLE CHANGES:**

* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.

Other changes:

* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12 or greater.
* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12 or greater.
* fix: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option works with 2 (`csv.QUOTE_NONNUMERIC <https://docs.python.org/3/library/csv.html#csv.QUOTE_NONNUMERIC>`__). Use the :code:`--locale` option to set the locale of any formatted numbers.

1.5.0 - March 28, 2024
----------------------

Expand All @@ -21,7 +27,7 @@
* :code:`--sniff-limit`
* :code:`--no-inference`

* feat: :doc:`/scripts/csvpy` removes the ``--linenumbers`` and ``--zero`` output options, which had no effect.
* feat: :doc:`/scripts/csvpy` removes the :code:`--linenumbers` and :code:`--zero` output options, which had no effect.
* feat: :doc:`/scripts/in2csv` adds a :code:`--reset-dimensions` option to `recalculate <https://openpyxl.readthedocs.io/en/stable/optimized.html#worksheet-dimensions>`_ the dimensions of an XLSX file, instead of trusting the file's metadata. csvkit's dependency `agate-excel <https://agate-excel.readthedocs.io/en/latest/>`_ 0.4.0 automatically recalculates the dimensions if the file's metadata expresses dimensions of "A1:A1" (a single cell).
* fix: :doc:`/scripts/csvlook` only reads up to :code:`--max-rows` rows instead of the entire file.
* fix: :doc:`/scripts/csvpy` supports the existing input options:
Expand Down Expand Up @@ -61,7 +67,7 @@
1.2.0 - October 4, 2023
-----------------------

* fix: :doc:`/scripts/csvjoin` uses the correct columns when performing a ``--right`` join.
* fix: :doc:`/scripts/csvjoin` uses the correct columns when performing a :code:`--right` join.
* Add SQLAlchemy 2 support.
* Drop Python 3.7 support (end-of-life was June 5, 2023).

Expand Down
18 changes: 10 additions & 8 deletions csvkit/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python

import argparse
import bz2
import csv
Expand All @@ -22,6 +21,8 @@
except ImportError:
zstandard = None

# Valid values for the --quoting / --out-quoting options: every QUOTE_*
# constant exposed by the running interpreter's csv module, in ascending order.
# Discovering them at import time (rather than hard-coding a list) means newer
# constants are offered automatically where the interpreter defines them.
QUOTING_CHOICES = sorted(
    getattr(csv, attr)
    for attr in dir(csv)
    if attr.startswith('QUOTE_')
)


class LazyFile:
"""
Expand Down Expand Up @@ -170,17 +171,17 @@ def _init_common_parser(self):
help='Character used to quote strings in the input CSV file.')
if 'u' not in self.override_flags:
self.argparser.add_argument(
'-u', '--quoting', dest='quoting', type=int, choices=[0, 1, 2, 3],
help='Quoting style used in the input CSV file. 0 = Quote Minimal, 1 = Quote All, '
'2 = Quote Non-numeric, 3 = Quote None.')
'-u', '--quoting', dest='quoting', type=int, choices=QUOTING_CHOICES,
help='Quoting style used in the input CSV file: 0 quote minimal, 1 quote all, '
'2 quote non-numeric, 3 quote none.')
if 'b' not in self.override_flags:
self.argparser.add_argument(
'-b', '--no-doublequote', dest='doublequote', action='store_false',
help='Whether or not double quotes are doubled in the input CSV file.')
if 'p' not in self.override_flags:
self.argparser.add_argument(
'-p', '--escapechar', dest='escapechar',
help='Character used to escape the delimiter if --quoting 3 ("Quote None") is specified and to escape '
help='Character used to escape the delimiter if --quoting 3 ("quote none") is specified and to escape '
'the QUOTECHAR if --no-doublequote is specified.')
if 'z' not in self.override_flags:
self.argparser.add_argument(
Expand Down Expand Up @@ -337,12 +338,13 @@ def get_column_types(self):
type_kwargs['null_values'].append(null_value)

text_type = agate.Text(**type_kwargs)
number_type = agate.Number(locale=self.args.locale, **type_kwargs)

if self.args.no_inference:
if getattr(self.args, 'no_inference', None):
types = [text_type]
elif getattr(self.args, 'out_quoting', None) == 2:
types = [number_type, text_type]
else:
number_type = agate.Number(locale=self.args.locale, **type_kwargs)

# See the order in the `agate.TypeTester` class.
types = [
agate.Boolean(**type_kwargs),
Expand Down
45 changes: 30 additions & 15 deletions csvkit/utilities/csvformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

import agate

from csvkit.cli import CSVKitUtility, make_default_headers
from csvkit.cli import QUOTING_CHOICES, CSVKitUtility, make_default_headers


class CSVFormat(CSVKitUtility):
description = 'Convert a CSV file to a custom output format.'
override_flags = ['L', 'blanks', 'date-format', 'datetime-format']
override_flags = ['blanks', 'date-format', 'datetime-format']

def add_arguments(self):
self.argparser.add_argument(
Expand All @@ -29,9 +29,9 @@ def add_arguments(self):
'-Q', '--out-quotechar', dest='out_quotechar',
help='Character used to quote strings in the output file.')
self.argparser.add_argument(
'-U', '--out-quoting', dest='out_quoting', type=int, choices=[0, 1, 2, 3],
help='Quoting style used in the output file. 0 = Quote Minimal, 1 = Quote All, '
'2 = Quote Non-numeric, 3 = Quote None.')
'-U', '--out-quoting', dest='out_quoting', type=int, choices=QUOTING_CHOICES,
help='Quoting style used in the output file: 0 quote minimal, 1 quote all, '
'2 quote non-numeric, 3 quote none.')
self.argparser.add_argument(
'-B', '--out-no-doublequote', dest='out_doublequote', action='store_false',
help='Whether or not double quotes are doubled in the output file.')
Expand Down Expand Up @@ -72,18 +72,33 @@ def main(self):
if self.additional_input_expected():
sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')

reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
if self.args.no_header_row:
# Peek at a row to get the number of columns.
_row = next(reader)
headers = make_default_headers(len(_row))
reader = itertools.chain([headers, _row], reader)

if self.args.skip_header:
next(reader)

writer.writerows(reader)
if self.args.out_quoting == 2:
table = agate.Table.from_csv(
self.input_file,
skip_lines=self.args.skip_lines,
column_types=self.get_column_types(),
**self.reader_kwargs,
)

# table.to_csv() has no option to omit the column names.
if not self.args.skip_header:
writer.writerow(table.column_names)

writer.writerows(table.rows)
else:
reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
if self.args.no_header_row:
# Peek at a row to get the number of columns.
_row = next(reader)
headers = make_default_headers(len(_row))
reader = itertools.chain([headers, _row], reader)

if self.args.skip_header:
next(reader)

writer.writerows(reader)


def launch_new_instance():
Expand Down
8 changes: 4 additions & 4 deletions docs/common_arguments.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ csvkit's tools share a set of common command-line arguments. Not every argument
-q QUOTECHAR, --quotechar QUOTECHAR
Character used to quote strings in the input CSV file.
-u {0,1,2,3}, --quoting {0,1,2,3}
Quoting style used in the input CSV file. 0 = Quote
Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 =
Quote None.
Quoting style used in the input CSV file: 0 quote
minimal, 1 quote all, 2 quote non-numeric, 3 quote
none.
-b, --no-doublequote Whether or not double quotes are doubled in the input
CSV file.
-p ESCAPECHAR, --escapechar ESCAPECHAR
Character used to escape the delimiter if --quoting 3
("Quote None") is specified and to escape the
("quote none") is specified and to escape the
QUOTECHAR if --no-doublequote is specified.
-z FIELD_SIZE_LIMIT, --maxfieldsize FIELD_SIZE_LIMIT
Maximum length of a single field in the input CSV
Expand Down
2 changes: 1 addition & 1 deletion docs/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ Currently, the following tools stream:

* :doc:`/scripts/csvclean`
* :doc:`/scripts/csvcut`
* :doc:`/scripts/csvformat`
* :doc:`/scripts/csvformat` unless :code:`--out-quoting 2` is set
* :doc:`/scripts/csvgrep`
* :doc:`/scripts/csvstack`
* :doc:`/scripts/sql2csv`
Expand Down
6 changes: 0 additions & 6 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@
Release process
===============

.. admonition:: One-time setup

.. code-block:: bash
pip install --upgrade build twine
#. All tests pass on continuous integration
#. The changelog is up-to-date and dated
#. If new options are added, regenerate the usage information in the documentation with, for example:
Expand Down
12 changes: 6 additions & 6 deletions docs/scripts/csvformat.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ Convert a CSV file to a custom output format.:
.. code-block:: none
usage: csvformat [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b]
[-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H]
[-K SKIP_LINES] [-v] [-l] [--zero] [-V] [-E]
[-D OUT_DELIMITER] [-T] [-A] [-Q OUT_QUOTECHAR]
[-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING]
[-L LOCALE] [-S] [-H] [-K SKIP_LINES] [-v] [-l] [--zero] [-V]
[-E] [-D OUT_DELIMITER] [-T] [-A] [-Q OUT_QUOTECHAR]
[-U {0,1,2,3}] [-B] [-P OUT_ESCAPECHAR]
[-M OUT_LINETERMINATOR]
[FILE]
Expand All @@ -36,9 +36,9 @@ Convert a CSV file to a custom output format.:
-Q OUT_QUOTECHAR, --out-quotechar OUT_QUOTECHAR
Character used to quote strings in the output file.
-U {0,1,2,3}, --out-quoting {0,1,2,3}
Quoting style used in the output file. 0 = Quote
Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 =
Quote None.
Quoting style used in the output file: 0 quote
minimal, 1 quote all, 2 quote non-numeric, 3 quote
none.
-B, --out-no-doublequote
Whether or not double quotes are doubled in the output
CSV file.
Expand Down
99 changes: 99 additions & 0 deletions tests/test_utilities/test_csvformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,102 @@ def test_lineterminator(self):
self.assertLines(['-M', 'XYZ', 'examples/dummy.csv'], [
'a,b,cXYZ1,2,3XYZ',
], newline_at_eof=False)


class TestCSVFormatQuoteNonNumeric(CSVKitTestCase, EmptyFileTests):
    """Exercise csvformat with ``-U 2`` (quote non-numeric) output.

    With ``-U 2`` the expected output quotes text cells (including the header
    row) and leaves numeric cells unquoted.
    """

    Utility = CSVFormat

    # New test compared to TestCSVFormat.
    def test_locale(self):
        self.assertLines(['-U', '2', '--locale', 'de_DE', 'examples/test_locale.csv'], [
            '"a","b","c"',
            '1.7,200000000,""',
        ])

    def test_launch_new_instance(self):
        # Pass -U 2 so the entry point runs the quote-non-numeric branch that
        # this suite targets, instead of repeating the default-path smoke test.
        argv = [self.Utility.__name__.lower(), '-U', '2', 'examples/dummy.csv']
        with patch.object(sys, 'argv', argv):
            launch_new_instance()

    def test_skip_lines(self):
        self.assertLines(['-U', '2', '--skip-lines', '3', '-D', '|', 'examples/test_skip_lines.csv'], [
            '"a"|"b"|"c"',
            '1|2|3',
        ])

    def test_skip_header(self):
        self.assertLines(['-U', '2', '--skip-header', 'examples/dummy.csv'], [
            '1,2,3',
        ])

    def test_skip_header_no_header_row(self):
        self.assertLines(['-U', '2', '--no-header-row', '--skip-header', 'examples/no_header_row.csv'], [
            '1,2,3',
        ])

    def test_no_header_row(self):
        self.assertLines(['-U', '2', '--no-header-row', 'examples/no_header_row.csv'], [
            '"a","b","c"',
            '1,2,3',
        ])

    def test_linenumbers(self):
        self.assertLines(['-U', '2', '--linenumbers', 'examples/dummy.csv'], [
            '"line_number","a","b","c"',
            '1,1,2,3',
        ])

    def test_delimiter(self):
        self.assertLines(['-U', '2', '-D', '|', 'examples/dummy.csv'], [
            '"a"|"b"|"c"',
            '1|2|3',
        ])

    def test_tabs(self):
        self.assertLines(['-U', '2', '-T', 'examples/dummy.csv'], [
            '"a"\t"b"\t"c"',
            '1\t2\t3',
        ])

    def test_asv(self):
        self.assertLines(['-U', '2', '-A', 'examples/dummy.csv'], [
            '"a"\x1f"b"\x1f"c"\x1e1\x1f2\x1f3\x1e',
        ], newline_at_eof=False)

    def test_quotechar(self):
        input_file = io.BytesIO(b'a,b,c\n1*2,3,4\n')

        with stdin_as_string(input_file):
            self.assertLines(['-U', '2', '-Q', '*'], [
                '*a*,*b*,*c*',
                '*1**2*,3,4',
            ])

        input_file.close()

    def test_doublequote(self):
        input_file = io.BytesIO(b'a\n"a ""quoted"" string"')

        with stdin_as_string(input_file):
            self.assertLines(['-U', '2', '-P', '#', '-B'], [
                '"a"',
                '"a #"quoted#" string"',
            ])

        input_file.close()

    def test_escapechar(self):
        input_file = io.BytesIO(b'a,b,c\n1"2,3,4\n')

        with stdin_as_string(input_file):
            # NOTE: the trailing `-U 3` overrides the earlier `-U 2` (argparse
            # keeps the last value), so this case checks escaping under
            # quote-none; the unquoted header in the expected output reflects that.
            self.assertLines(['-U', '2', '-P', '#', '-U', '3'], [
                'a,b,c',
                '1#"2,3,4',
            ])

        input_file.close()

    def test_lineterminator(self):
        self.assertLines(['-U', '2', '-M', 'XYZ', 'examples/dummy.csv'], [
            '"a","b","c"XYZ1,2,3XYZ',
        ], newline_at_eof=False)

0 comments on commit 95dc26d

Please sign in to comment.