Skip to content

Add DataFrameCopyAppend class #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas_to_postgres/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .copy_df import DataFrameCopy
from .copy_hdf import HDFTableCopy, SmallHDFTableCopy, BigHDFTableCopy
from .append_df import DataFrameCopyAppend
from .hdf_to_postgres import hdf_to_postgres, create_hdf_table_objects, copy_worker
from .utilities import (
hdf_metadata,
Expand Down
85 changes: 85 additions & 0 deletions pandas_to_postgres/append_df.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from .utilities import create_file_object, df_generator, cast_pandas
from ._base_copy import BaseCopy


class DataFrameCopyAppend(BaseCopy):
    """
    COPY a pandas DataFrame into an existing PostgreSQL table, appending rows.

    Differs from the main DataFrameCopy class in that it does not drop
    foreign keys, primary keys, or indexes, and it appends the DataFrame to
    the data already in the table rather than truncating first. Leaving the
    constraints and indexes in place during COPY costs a lot of performance.
    """

    def __init__(
        self, df, defer_sql_objs=False, conn=None, table_obj=None, csv_chunksize=10 ** 6
    ):
        """
        Parameters
        ----------
        df: pandas DataFrame
            Data to copy to database table
        defer_sql_objs: bool
            multiprocessing has issue with passing SQLALchemy objects, so if
            True, defer attributing these to the object until after pickled by Pool
        conn: SQlAlchemy Connection
            Managed outside of the object
        table_obj: SQLAlchemy model object
            Destination SQL Table
        csv_chunksize: int
            Max rows to keep in memory when generating CSV for COPY
        """
        super().__init__(defer_sql_objs, conn, table_obj, csv_chunksize)

        self.df = df
        # Total row count, used only for the completion log message in copy()
        self.rows = self.df.shape[0]

    # The following overrides are intentional no-ops: this class appends to a
    # live table, so existing data, keys, and constraints must be left intact.

    def truncate(self):
        """No-op override: existing rows are kept so the DataFrame appends."""
        pass

    def create_pk(self):
        """No-op override: the primary key is never dropped, so never rebuilt."""
        pass

    def create_fks(self):
        """No-op override: foreign keys are never dropped, so never rebuilt."""
        pass

    def drop_fks(self):
        """No-op override: foreign keys stay in place during the COPY."""
        pass

    def drop_pk(self):
        """No-op override: the primary key stays in place during the COPY."""
        pass

    def copy_from_file(self, file_object):
        """
        COPY to PostgreSQL table using StringIO CSV object

        Parameters
        ----------
        file_object: StringIO
            CSV formatted data to COPY from DataFrame to PostgreSQL; the first
            line must be the column header row
        """
        cur = self.conn.connection.cursor()
        file_object.seek(0)
        # Consume the header line so COPY receives only data rows; the column
        # list (trailing newline included — harmless whitespace in the SQL)
        # is spliced into the COPY statement instead.
        columns = file_object.readline()
        sql = "COPY {table} ({columns}) FROM STDIN WITH CSV".format(
            table=self.sql_table, columns=columns
        )
        cur.copy_expert(sql=sql, file=file_object)

    def copy(self, functions=None):
        """
        Format the DataFrame and COPY it to the table in CSV chunks.

        Parameters
        ----------
        functions: list of callables or None
            Formatting functions applied to the DataFrame before COPY.
            Defaults to [cast_pandas]. (A None sentinel is used instead of a
            mutable default list, which would be shared across calls.)
        """
        if functions is None:
            functions = [cast_pandas]
        # drop_fks/drop_pk/truncate are no-ops here but are kept in the call
        # sequence for symmetry with DataFrameCopy.copy()
        self.drop_fks()
        self.drop_pk()
        self.df = self.data_formatting(self.df, functions=functions)
        with self.conn.begin():
            self.truncate()

            self.logger.info("Creating generator for chunking dataframe")
            for chunk in df_generator(self.df, self.csv_chunksize):

                self.logger.info("Creating CSV in memory")
                fo = create_file_object(chunk)

                self.logger.info("Copying chunk to database")
                self.copy_from_file(fo)
                # Free the in-memory CSV before building the next chunk
                del fo

            self.logger.info("All chunks copied ({} rows)".format(self.rows))

        self.create_pk()
        self.create_fks()
        self.analyze()
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ def read(fname, lines=False):

setup(
name="pandas_to_postgres",
version="v0.0.3",
version="v0.0.4",
author="Brendan Leonard <Harvard CID>",
description=(
"Utility to copy Pandas DataFrames and DataFrames stored in HDF5 files "
"to PostgreSQL "
),
url="http://github.com/cid-harvard/pandas-to-postgres",
packages=find_packages(),
install_requires=["SQLAlchemy", "pandas", "psycopg2", "tables"],
install_requires=["SQLAlchemy", "pandas", "psycopg2-binary", "tables"],
long_description=read("README.md"),
classifiers=[
"Topic :: Database",
Expand Down