#!/usr/bin/env python
# mysql2csv (part of ossobv/vcutil) // wdoekes/2011-2024 // Public Domain
#
# This mysql2csv dump script is a convenient way to dump and/or prune
# large tables. Instead of selecting all data at once, it takes chunks
# of data based on a datetime column supplied by the caller.
#
# For example, if you run this:
#
# mysql2csv ... mydatabase large_table created
#
# Then the dump script will check the MIN(created) on large_table, and
# from that point, start BETWEEN queries between that time and the next
# hour.
#
# SELECT * FROM mydatabase.large_table
# WHERE created >= '2011-01-21 03:00:00' AND
# created < '2011-01-21 04:00:00'
# ORDER BY created
#
# If you have proper indexes on the created column, these
# "hour-selecting" queries should be light enough to handle without
# locking up your production database.
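#
# A minimal sketch of such an index, reusing the names from the example
# above (adjust to your own schema):
#
#   CREATE INDEX large_table_created ON mydatabase.large_table (created);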
#
# If you have an index on an integer column, that can be used too, but
# then you'll want to pass a timestamp column as extra argument:
#
# mysql2csv ... mydatabase large_table id created
#
# When selecting by id, you may want to fetch larger chunks at once (see
# the --period-size argument).
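#
# For example (hypothetical paths/values), dumping and pruning by id in
# chunks of 50000 rows, with file timestamps taken from created:
#
#   mysql2csv -f ~/.my.cnf --period-size=50000 --prune \
#       mydatabase large_table id created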
#
# See mysql2csv -h for more info.
#
# Created by Walter Doekes, OSSO B.V.
#
# Changes:
# - 2011-01-21: Initial from http://wjd.nu/files/2011/01/mysql2csv.py
# - 2016-05-23: PEP cleanup, add into vcutil repository.
# - 2016-05-23: Only write CSVs if there is data for that period. End
# before until_date if the max date is lower.
# - 2016-05-23: Use argparse for the options. Read ini file as well.
# - 2016-05-23: Add --quiet option to silence the output.
# - 2016-05-23: Append to CSV instead of clobbering it. Safer for data.
# - 2016-05-23: Allow sleep-time to be set.
# - 2016-05-23: Prune during dump, using the same date filters.
# - 2016-05-23: Fix problem with TIMESTAMP column and DST changes.
# - 2016-05-25: Add --extra-where parameter to limit the query results.
# - 2019-04-02: Trap incorrect None conversions by MySQLdb: better crash
# than silently truncate/NULLify values.
# - 2019-04-02: Handle YYYY-mm-dd HH:MM:SS[.qqq[qqq]] TIMESTAMP values.
# - 2022-03-04: Compatibility tweaks (py2/3, DATETIME NULL, BLOB).
# - 2022-12-05: Add the option to sort/filter by integer column.
# - 2023-10-26: When sorting by integer, allow time based templates,
# by reading the time from another (unindexed) column.
# - 2023-10-26: Rename -t/--template to -o/--output.
# - 2024-01-10: Sanitize --prune parts after previous refactor.
#
# Caveats:
# - mysql2csv clobbers existing files without asking!
# - When using a different column (date_column) for the filename template,
# mysql2csv will not close files until the end. If you use %Y%m%d, you get
# up to 4096 file descriptors when processing 11 years at once.
#
from __future__ import print_function, unicode_literals
import argparse # pip install argparse on old pre-2.7 python
import getpass
import sys
import time
from base64 import b16encode
from datetime import date, datetime, timedelta
from decimal import Decimal
from os import utime
from MySQLdb import OperationalError, connect, converters
try:
from configparser import RawConfigParser
long = int
unicode = str
except ImportError: # python2-
from ConfigParser import RawConfigParser
DEFAULT_FILENAME = '%(database)s.%(table)s.%%Y%%m.csv'
DEFAULT_SLEEP_TIME = 0.005 # sleep N secs every iteration, easing server load
EXAMPLE_UNTIL_DATE = datetime(2035, 1, 1)
ONE_HOUR = timedelta(seconds=3600)
TWO_HOURS = timedelta(seconds=7200)
class ConfigError(ValueError):
pass
def _make_conversion_safe(orig_conversion):
"""
Wrap orig_conversion with one that checks that the returned value is
not None.
We do NOT want this flaky behaviour::
def Date_or_None(s):
try: return date(*[ int(x) for x in s.split('-',2)])
except: return None
    If the type conversion is not recognised by MySQLdb, don't have it
    convert to None. This is Python, not let-me-silently-downcast-this-PHP.
"""
def _safe_conversion(s):
"Assert that None is never returned."
ret = orig_conversion(s)
if ret is None:
# > select ts, ts is null, ts = '0000-00-00 00:00:00'
# from communication where ts is null;
# +---------------------+------------+----------------------------+
# | ts | ts is null | ts = '0000-00-00 00:00:00' |
# +---------------------+------------+----------------------------+
# | 0000-00-00 00:00:00 | 0 | 1 |
# +---------------------+------------+----------------------------+
# ... so ts IS NULL but NOT NULL at the same time.
if s == '0000-00-00 00:00:00':
return "0000-00-00 00:00:00"
raise NotImplementedError(
'conversion from {!r} yielded None; bug in MySQLdb?'.format(s))
return ret
return _safe_conversion
def _make_conversion_safe_tuple(orig_tuple):
"""
Update conversion function with a safe-wrapped one.
Note that conversions *from* SQL to Python will have integer-keys.
We're only concerned with those.
The values can be either (a) a conversion function, or (b) a list of
tuples where the second element is the conversion function.
"""
k, v = orig_tuple
if not isinstance(k, int):
return k, v
if isinstance(v, (list, tuple)):
new_list = [(i[0], _make_conversion_safe(i[1])) for i in v]
return k, new_list
return k, _make_conversion_safe(v)
def _make_timestamp_conversion(orig_conversion):
"""
Create conversion function with fallback to original.
"""
def _timestamp_conversion(s):
"Convert 'YYYY-mm-dd HH:MM:SS[.qqq[qqq]]' to datetime"
try:
ymd, hms = s.split(' ')
y, m, d = [int(i) for i in ymd.split('-')]
h, mm, s = hms.split(':')
if '.' in s:
s, us = s.split('.')
us = int(us[0:6].ljust(6, '0'))
else:
us = 0
h, mm, s = int(h), int(mm), int(s)
return datetime(
year=y, month=m, day=d, hour=h, minute=mm, second=s,
microsecond=us)
except Exception:
return orig_conversion(s)
return _timestamp_conversion
# Update conversions to make them safe from None-truncation.
CONVERSIONS = dict(
_make_conversion_safe_tuple(it) for it in converters.conversions.items())
# Add TIMESTAMP conversion that handles 'YYYY-mm-dd HH:MM:SS[.qqq[qqq]]'.
CONVERSIONS[converters.FIELD_TYPE.TIMESTAMP] = _make_timestamp_conversion(
CONVERSIONS[converters.FIELD_TYPE.TIMESTAMP])
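# For example (an illustrative direct call, not part of the normal flow),
# fractional seconds now survive the conversion:
#
#   CONVERSIONS[converters.FIELD_TYPE.TIMESTAMP]('2019-04-02 12:34:56.78')
#   # -> datetime.datetime(2019, 4, 2, 12, 34, 56, 780000)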
class Interval(object):
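    """
    Walk [begin, end) windows over table_name, ordered by seq_column.

    self.type is datetime, int or None (empty table). Callers loop while
    begin <= last, advancing with set_next(), or seek_next() after an
    empty chunk; both compensate for DST jumps on TIMESTAMP columns.
    """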
def __init__(self, db_connection, table_name, seq_column,
period_size, until=None, extra_where=None):
self._db = db_connection
self._table_name = table_name
self._seq_column = seq_column
self._until = until
self._extra_where = extra_where
self._set_min()
self._set_max()
if self.type is None:
self.period = None
self.begin = 2
self.end = None # no data
self.last = 1 # no data, end before we start
elif self.type == datetime:
self.period = timedelta(seconds=period_size)
self.begin = self._min_seq.replace(minute=0, second=0)
self.end = self.begin + self.period
self.last = self._max_seq # inclusive!
elif self.type == int:
self.period = period_size
self.begin = self._min_seq - (self._min_seq % self.period)
self.end = self.begin + self.period
self.last = self._max_seq # inclusive!
self._maybe_dst_issues = (
self.type == datetime and period_size < 7200)
def _set_min(self):
self._min_seq = self._find_min('(1 = 1)')
if self._min_seq is None:
self.type = None
elif isinstance(self._min_seq, datetime):
self.type = datetime
elif isinstance(self._min_seq, int):
self.type = int
else:
raise NotImplementedError('Unhandled %s.%s type %r (%s)' % (
self._table_name, self._seq_column, self._min_seq,
type(self._min_seq).__name__))
def _set_max(self):
if self._until is None:
where = '(1 = 1)'
elif isinstance(self._until, date):
where = "(%s < '%s')" % (self._seq_column, self._until.strftime(
'%Y-%m-%d'))
elif isinstance(self._until, datetime):
where = "(%s < '%s')" % (self._seq_column, self._until.strftime(
'%Y-%m-%d %H:%M:%S'))
elif isinstance(self._until, int):
where = "(%s < %s)" % (self._seq_column, self._until)
else:
raise NotImplementedError(self._until)
query = "SELECT MAX(%s) FROM %s WHERE %s %s" % (
self._seq_column, self._table_name, where, self._extra_where)
# #print(query)
cursor = self._db.cursor()
cursor.execute(query)
ret = cursor.fetchall()
self._max_seq = ret[0][0] # inclusive!
def _find_min(self, where):
query = "SELECT MIN(%s) FROM %s WHERE %s %s" % (
self._seq_column, self._table_name, where, self._extra_where)
# #print(query)
cursor = self._db.cursor()
cursor.execute(query)
ret = cursor.fetchall()
return ret[0][0]
def set_next(self):
if self._maybe_dst_issues and has_crossed_dst(
self._db, self.end, self.period):
# For certain columns, string/time comparisons can fail during
# DST changes, for example:
# TABLE auth (ts timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP)
# mysql> SELECT MIN(ts) FROM auth\G
# min(ts): 2014-10-26 02:00:02
# > SELECT COUNT(*) FROM auth WHERE '2014-10-26 02:00:00' <= ts
# AND ts < '2014-10-26 03:00:00' ORDER BY ts\G
# COUNT(*): 0
# > SELECT COUNT(*) FROM auth WHERE '2014-10-26 01:59:59' <= ts
# AND ts < '2014-10-26 03:00:00' ORDER BY ts\G
# COUNT(*): 256
# For DATETIME columns, the problem does not occur.
# As a workaround we'll have to do an extra check to see if
# we've just crossed a DST line and add more time to the
# query.
#
# Keep begin time, but extend end time if needed.
self.end = self.begin + TWO_HOURS
else:
self.begin, self.end = self.end, self.end + self.period
def seek_next(self):
if self._maybe_dst_issues and has_crossed_dst(
self._db, self.begin, self.period):
# Lower begin time, extend end time.
self.begin = self._find_min("(%s >= '%s')" % (
self._seq_column,
(self.begin - self.period).strftime('%Y-%m-%d %H:%M:%S'),))
self.begin = self.begin - ONE_HOUR
self.end = self.begin + TWO_HOURS
else:
# Normal.
self.begin = self._find_min("(%s >= '%s')" % (
self._seq_column, self.end.strftime('%Y-%m-%d %H:%M:%S'),))
self.end = self.begin + self.period
class FileManager(object):
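    """
    Write CSV lines to files named after a strftime filename template.

    Files are opened lazily, get the CSV header when started at position
    0, and get their mtime set to the last write time on close().
    """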
def __init__(self, filename_template, header, is_monotonic):
self._filename_template = filename_template
self._header = header
# If is_monotonic, we'll assume we can close files as soon as we're
# moving to the next file. For others, it might look like we're leaking
# files until the last moment when close() is called.
self._is_monotonic = is_monotonic
self._written_files = {}
self._files = {}
def _open(self, filename):
assert filename not in self._files, (filename, self._files)
# We must open in append-mode if sort_column!=date_column.
        # If we get a record with an older date, we might need to reopen
        # a file we've already written to.
fp = open(filename, ('a' if filename in self._written_files else 'w'))
if fp.tell() == 0:
fp.write(self._header) # write header because we're at pos 0
self._written_files[filename] = time.time()
self._files[filename] = fp
return fp
def _make_filename(self, date_value):
return date_value.strftime(self._filename_template)
def close(self):
for filename, fp in self._files.items():
fp.close()
# Set last updated time after closing, to get a more
# realistic view of when the files were populated.
last_updated = self._written_files[filename]
utime(filename, (last_updated, last_updated))
self._files = {}
def write(self, line, when):
opened_new_file = False
filename = self._make_filename(when)
try:
# Most common case first. We expect few exceptions.
fp = self._files[filename]
except KeyError:
if self._is_monotonic:
self.close()
fp = self._open(filename)
opened_new_file = True
self._written_files[filename] = time.time()
fp.write(line)
return opened_new_file
def dump2csv_and_prune(db_connection, kwargs):
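    """
    Dump the table to CSV file(s) one chunk at a time and, if requested,
    prune each chunk immediately after writing it.
    """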
# Get relevant args.
(table_name, sort_column, date_column, until, output,
period_size, quiet, sleep, prune) = tuple(
kwargs[i] for i in (
'table', 'sort_column', 'date_column', 'until', 'output',
'period_size', 'quiet', 'sleep', 'prune'))
extra_where = (kwargs.get('extra_where') or '').strip()
if extra_where:
extra_where = 'AND (%s)' % (extra_where.replace('%', '%%'),)
rows = describe_table(db_connection, table_name)
# Figure out the interval/datatypes involved.
try:
interval = Interval(
db_connection, table_name, sort_column,
period_size, until, extra_where)
except OperationalError as e:
if e.args[0] == 1054:
# Unknown column 'xyz' in 'field list'
raise ConfigError('there is no %r column in table %r' % (
sort_column, table_name))
raise
# Check what column to use for the filenames.
date_column_idx = None
if date_column is None and interval.type == datetime:
date_column = sort_column
is_monotonic = True
elif date_column is None and '%' in output:
raise NotImplementedError(
'output filename templates (using %) require a date_column')
elif date_column is None:
assert '%' not in output, output
is_monotonic = True # very monotonic, only one file
else:
is_monotonic = False
if date_column is not None:
try:
date_column_idx = rows.index(date_column)
except ValueError:
raise ConfigError('there is no %r column in table %r' % (
date_column, table_name))
# Make filemanager that auto-opens files and auto-adds CSV headers.
# This allows us to write to multiple files simultaneously. For instance
# when using an INT index, but storing the files in a time-based filename.
filemanager = FileManager(
filename_template=output,
header='%s\n' % (','.join(rows),),
is_monotonic=is_monotonic)
    # Create cursor. Increment in period-sized steps (1 hour by default).
cursor = db_connection.cursor()
caret_angle = 0
while interval.begin <= interval.last:
if interval.type == datetime:
begin_range_sql = "'%s'" % (
interval.begin.strftime('%Y-%m-%d %H:%M:%S'),)
end_range_sql = "'%s'" % (
interval.end.strftime('%Y-%m-%d %H:%M:%S'),)
else:
begin_range_sql = interval.begin
end_range_sql = interval.end
# Query a bit of data and write to file.
query = (
"SELECT * FROM %s WHERE %s <= %s AND %s < %s %s "
"ORDER BY %s" % (
table_name, begin_range_sql, sort_column,
sort_column, end_range_sql, extra_where, sort_column))
# #print(query)
cursor.execute(query)
row = None
cursorit = iter(cursor)
try:
row = next(cursorit)
except StopIteration:
# No data! Seek to first data now. Otherwise we might keep
# scanning and scanning.
interval.seek_next()
continue
# Write data (first row, and the rest).
opened_new_file = filemanager.write(
line='%s\n' % (','.join(to_string(i) for i in row),),
when=row[date_column_idx])
for row in cursorit:
opened_new_file |= filemanager.write(
line='%s\n' % (','.join(to_string(i) for i in row),),
when=row[date_column_idx])
if opened_new_file and not quiet:
sys.stdout.write('+') # a new file was opened
sys.stdout.flush()
# Prune immediately in equal sized chunks.
if prune and row is not None:
cursor.execute(
"DELETE FROM %s WHERE %s <= %s AND %s < %s %s" % (
table_name, begin_range_sql, sort_column,
sort_column, end_range_sql, extra_where))
db_connection.commit()
if not quiet:
sys.stdout.write('!') # a chunk was deleted
sys.stdout.flush()
# Increase dates.
interval.set_next()
if not quiet:
sys.stdout.write('%s\b' % (r'\|/-'[caret_angle],)) # progress
sys.stdout.flush()
caret_angle = (caret_angle + 1) % 4
if sleep:
time.sleep(sleep)
filemanager.close()
def describe_table(db_connection, table_name):
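    """Return the column names of table_name, in table order."""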
cursor = db_connection.cursor()
cursor.execute('DESCRIBE %s;' % (table_name,))
ret = []
for row in cursor.fetchall():
ret.append(row[0])
return ret
def has_crossed_dst(db_connection, begin_date, period):
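    """
    Return whether the period leading up to begin_date crosses a DST
    change; if so, the UNIX timestamp delta is an hour off from period.
    """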
cursor = db_connection.cursor()
cursor.execute(
"SELECT (UNIX_TIMESTAMP('%s') - UNIX_TIMESTAMP('%s')) AS v" % (
begin_date.strftime('%Y-%m-%d %H:%M:%S'),
(begin_date - period).strftime('%Y-%m-%d %H:%M:%S')))
ret = cursor.fetchall()
found = ret[0][0]
expected = period.total_seconds()
if found == expected:
return False
assert found in (expected - 3600, expected + 3600), ret
return True
def to_string(data):
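    """
    Format one column value as a CSV field: strings get quoted with inner
    quotes doubled, bytes become a 0x hex literal, None becomes NULL.
    """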
if isinstance(data, str):
return '"%s"' % (data.replace('"', '""'),)
if isinstance(data, unicode):
return '"%s"' % (data.encode('utf-8').replace('"', '""'),)
if isinstance(data, int) or isinstance(data, long):
return str(data)
if isinstance(data, datetime):
return '"%s"' % (data.strftime('%Y-%m-%d %H:%M:%S'),)
if isinstance(data, date): # must come after datetime
return '"%s"' % (data.strftime('%Y-%m-%d'),)
if isinstance(data, Decimal):
return str(data)
if data is None:
return 'NULL'
if isinstance(data, bytes):
return '0x%s' % (b16encode(data).decode('ascii'),) # UPPERCASE
raise TypeError('Unexpected type %s for data %r: %s' % (
type(data), data, data))
def parse_defaults_file(filename):
config = RawConfigParser()
with open(filename, 'r') as fp:
config.read_file(fp)
sections = config.sections()
for possible_section in ('mysql2csv', 'mysql', 'client'):
if possible_section in sections:
break
else:
return {}
return dict(config.items(possible_section))
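# An example defaults-file for the above (illustrative values only; the
# command line overrules anything set here):
#
#   [mysql2csv]
#   host = db.example.com
#   user = dumper
#   password = secret
#   sleep = 0.01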
def parse_date(date_):
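    """Parse a 'YYYY-mm-dd' string into a (naive, local time) datetime."""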
return datetime.fromtimestamp(
time.mktime(time.strptime(date_, '%Y-%m-%d')))
def main():
example_filename = DEFAULT_FILENAME % {
'database': '<DATABASE>', 'table': '<TABLE>'}
example_until = EXAMPLE_UNTIL_DATE.strftime('%Y-%m-%d')
parser = argparse.ArgumentParser(
prog='mysql2csv',
formatter_class=argparse.RawDescriptionHelpFormatter,
description=('''\
Dumps all data from the MySQL table to CSV-formatted files IN A NON-
LOCKING FASHION. In most cases you can safely use this on a production database
to dump old records that you will be pruning later on.
Supply the necessary HOST/USER/DATABASE to connect to; the password is prompted
on stdin if required. The data from TABLE will be exported ordered by
SORT_COLUMN in ascending order.
If strftime variables are used in the CSV filename template, they are
replaced with the values from the fetched period. The default OUTPUT template
is: '%(filename)s'
The default UNTIL time is %(until)s which should be far enough in the future
to get "all" records. (It is used as "exclusive" date, meaning that no records
from that date and onwards will be output/pruned.)
You can use the date(1) shell utility to specify a relative date for --until;
for example one year ago would be: date +%%Y-%%m-%%d -d '-1 year'
Supply --prune as argument to immediately prune the data after dumping it.
''' % {'filename': example_filename, 'until': example_until}))
parser.add_argument(
'-H', '--host',
help='which database host to connect to')
parser.add_argument(
'-p', '--port', type=int,
help='the database port if not 3306')
parser.add_argument(
'-u', '--user',
help='the username if not the current user')
parser.add_argument(
'-P', action='store_true',
help='ask for password')
parser.add_argument(
'-s', '--socket',
help='unix socket to connect to instead of tcp host')
parser.add_argument(
'-o', '--output',
help="output filename template to use instead of '%s'" % (
example_filename.replace('%', '%%'),))
parser.add_argument( # not through defaults-file
'--until', type=parse_date, default=None,
help='stop when arriving at this date (do not set for int columns)')
parser.add_argument(
'-q', '--quiet', action='store_true',
help='be quiet, apart from showing errors')
parser.add_argument(
'--period-size', type=int, metavar='PERIOD', default=3600,
        help=('interval size in seconds, or the integer step size when '
              'dealing with a sequence (maybe use 50000 for primary key '
              'sorted ones)'))
parser.add_argument( # not through defaults-file
'--prune', action='store_true', default=False,
help='prunes the data after writing the csv (DESTRUCTIVE!)')
parser.add_argument(
'--sleep', type=float, metavar='SECONDS',
help=('sleep this long between every data fetch/prune; accepts '
'fractional seconds'))
parser.add_argument(
'--extra-where',
help=('additional filter to add to the SELECT and DELETE queries; '
'for example: is_active = 0'))
parser.add_argument(
'-f', '--defaults-file',
help=("a defaults-file in mysql-client fashion to set the username, "
"password or any of the other long options; checks the "
"[mysql2csv] section (or [mysql] or [client] if the former "
"doesn't exist)"))
parser.add_argument(
'database',
help='the database to connect to')
parser.add_argument(
'table',
help='the table to dump and/or prune')
parser.add_argument(
'sort_column',
help='the column that holds the indexed date or primary key')
parser.add_argument(
'date_column', nargs='?', default=None,
help=('the column that holds a datetime to use for the filename '
'template (same as sort_column if not specified)'))
# Extract args and merge with defaults. If a defaults-file is set,
# use that and have the command line overrule it.
args = parser.parse_args()
# Copy defaults-file contents.
kwargs = {'password': None}
if args.defaults_file:
kwargs.update(parse_defaults_file(args.defaults_file))
if 'port' in kwargs:
kwargs['port'] = int(kwargs['port'])
if 'sleep' in kwargs:
kwargs['sleep'] = float(kwargs['sleep'])
# Copy command line args.
for arg in ('host', 'port', 'user', 'socket', 'output',
'database', 'table', 'sort_column', 'date_column', 'until',
'period_size', 'quiet', 'sleep', 'extra_where'):
value = getattr(args, arg)
if value is not None or arg not in kwargs:
kwargs[arg] = value
kwargs['prune'] = args.prune
# Set defaults.
if kwargs['output'] is None:
kwargs['output'] = DEFAULT_FILENAME % {
'database': kwargs['database'],
'table': kwargs['table']}
if kwargs['sleep'] is None:
kwargs['sleep'] = DEFAULT_SLEEP_TIME
# Get password?
if args.P:
kwargs['password'] = getpass.getpass('Enter password: ')
# Extract mysql connect args.
my_kwargs = {}
for from_, to in (
('host', 'host'), ('port', 'port'), ('user', 'user'),
('password', 'passwd'), ('database', 'db'),
('socket', 'unix_socket')):
if kwargs[from_] is not None:
my_kwargs[to] = kwargs[from_]
# Print a bit of info.
if not kwargs['quiet']:
print('MySQL server settings: %s@%s/%s' % (
kwargs['user'] or '<nobody>',
kwargs['socket'] or kwargs['host'] or '<local>',
kwargs['database']))
if kwargs['until']:
until = kwargs['until'].strftime('%Y-%m-%d')
else:
until = 'end'
print(
'Table to dump: %s (ordered by %s, %d chunk/period, '
'from begin to %s)' % (
kwargs['table'], kwargs['sort_column'],
kwargs['period_size'], until))
print('Output filename template: "%s"' % (kwargs['output'],))
print('Processing %s.%s (ordered by %s)' % (
my_kwargs['db'], kwargs['table'], kwargs['sort_column']))
# Connect and do work.
conn = connect(conv=CONVERSIONS, **my_kwargs)
try:
dump2csv_and_prune(conn, kwargs)
except ConfigError as e:
print('\n%s: %s' % (sys.argv[0], e), file=sys.stderr)
exit(1)
finally:
conn.close()
if not kwargs['quiet']:
print()
if __name__ == '__main__':
main()
# vim: set ts=8 sw=4 sts=4 et ai: