Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix rendering of zero-length DataFrames in default card #1652

Merged
merged 1 commit into from
Dec 8, 2023

Conversation

amerberg
Copy link
Contributor

@amerberg amerberg commented Dec 7, 2023

The default card currently fails to render for a run that has an artifact which is a DataFrame with 0 rows and at least 1 string column or index level. The failure arises because `x.astype("string").str.len().max()` returns a null (missing) value rather than an integer when `x` has length zero.

Here is a trial pipeline:

from metaflow import FlowSpec, step, Parameter, IncludeFile, catch, card
import math, time, uuid, datetime, random, string, sys
from decimal import Decimal
import requests

class CustomClass():
    """Object whose string form is one mebibyte of the letter ``a``.

    Both ``str()`` and ``repr()`` yield the same 1024 * 1024 character
    string.
    """

    def __str__(self):
        """Return ``'a'`` repeated 1024 * 1024 times."""
        repeat_count = 1024 * 1024
        return 'a' * repeat_count

    def __repr__(self):
        """Return exactly what ``str()`` returns for this object."""
        text = str(self)
        return text

class DefaultCardFlow(FlowSpec):
    # Two-step flow (start -> end). The ``start`` step carries ``@card`` and
    # stores artifacts of many different kinds on ``self``, including a
    # zero-row DataFrame (``empty_dataframe``).

    # Flow parameters of several kinds.
    str_param = Parameter('str_param', default='刺身は美味しい')
    file_param = IncludeFile('file_param')
    json_param = Parameter('json_param', default='{"states": {[{"CA", 0}, {"NY", 1}]}')
    float_param = Parameter('float_param', default=math.pi)

    @card
    @step
    def start(self):
        """
        This step creates a bunch of artifacts of various kinds. They
        should show up nicely on the default card 🔬.
        """
        # Each helper below assigns one family of artifacts onto ``self``.
        self.python_objects()
        self.images()
        self.raise_exception()
        self.large_python_objects()
        self.custom_python_objects()
        self.pandas()
        self.numpy()
        self.next(self.end)

    @step
    def end(self):
        """
        The end.
        """
        pass

    def python_objects(self):
        """Store one small value of each common built-in Python type."""
        self.py_int = 434
        self.py_float = math.pi
        self.py_complex = complex(1, 2)
        self.py_list = [1, 2, 3]
        self.py_tuple = (1, 2, 3)
        self.py_range = range(10)
        self.py_str = '刺身は美味しい'
        self.py_bytes = b'\x00\x01\x02'
        self.py_bytearray = bytearray(b'\xf0\xf1\xf2')
        self.py_set = {1, 2, 3}
        self.py_frozenset = frozenset({4, 5, 6})
        self.py_dict = {'a': 1, 'null': None, True: False}
        self.py_type = type(str)
        self.py_bool = True
        self.py_none = None

    def large_python_objects(self):
        """Store big values: long strings, huge dicts, deep nesting, a blob."""
        # A fresh (unshared) list of card ranks is built for every suit.
        self.large_dict = {
            suit: (['ace'] + list(range(2, 10)) + ['jack', 'queen', 'king'])
            for suit in ['clubs', 'diamonds', 'hearts', 'spades']
        }

        self.large_int = 2 ** 65

        # Large string (may be truncated)
        response = requests.get('https://www.usconstitution.net/const.txt')
        self.large_str = response.text

        # Large dictionary with many keys (may be truncated)
        many_keys = {}
        for _ in range(1000000):
            key = str(uuid.uuid4())
            many_keys[key] = time.time()
        self.large_dict_many_keys = many_keys

        # Large dictionary with a large value (may be truncated)
        self.large_dict_large_val = {'constitution': self.large_str}

        # Large dictionary (may be truncated): 100 dicts nested inside one
        # another, with a single flag stored in the innermost one.
        outermost = {}
        current = outermost
        for depth in range(100):
            inner = {}
            current[depth] = inner
            current = inner
        current['bottom!'] = True
        self.large_dict_deep = outermost

        # Large blob: 100 * 1024**2 zero bytes.
        self.large_blob = bytes(100 * 1024 ** 2)

    def custom_python_objects(self):
        """Store a stdlib datetime, a user-defined object and a Decimal."""
        # A python object from stdlib (just print repr())
        now_utc = datetime.datetime.utcnow()
        self.custom_datetime = now_utc
        # A custom Python object
        self.custom_class = CustomClass()
        # A custom Python object (just print repr())
        self.custom_decimal = Decimal(0.1)

    def images(self):
        """Download a GIF, a JPG and a PNG and store their raw bytes."""
        gif_url = 'https://www.gif-vif.com/hacker-cat.gif'
        jpg_url = 'https://www.nasa.gov/centers/goddard/images/content/638831main_globe_east_2048.jpg'
        png_url = 'https://datavisdotblog.files.wordpress.com/2019/08/small-multiples.png'

        # A gif file
        self.img_gif = requests.get(gif_url).content
        # A jpg file
        self.img_jpg = requests.get(jpg_url).content
        # A png file
        self.img_png = requests.get(png_url).content

    def raise_exception(self):
        """Raise and immediately catch an exception, storing the object."""
        try:
            raise Exception('This is an exception!')
        except Exception as caught:
            # Exception object
            # We could print traceback too:
            # traceback.format_tb(self.exception.__traceback__)
            self.exception = caught

    def pandas(self):
        """Store three DataFrames: a wide one, a mixed-dtype one, an empty one."""
        # Local imports; ``datetime`` here is the class, which shadows the
        # module-level ``datetime`` module inside this method only.
        from datetime import datetime
        import numpy as np
        import pandas as pd

        # One column per uppercase letter; the column at position i holds
        # 1000 random integers drawn from [1, 10**i].
        columns = {}
        for power, letter in enumerate(string.ascii_uppercase):
            samples = [random.randint(1, 10 ** power) for _ in range(1000)]
            columns['this is column %s' % letter] = samples
        columns['nulls'] = [None] * 1000
        columns['times'] = [datetime.utcnow()] * 1000
        # Pandas series of timestamps
        columns['timestamps'] = pd.date_range('1/1/2000', periods=1000)

        self.dataframe = pd.DataFrame(columns)

        # Create sample data
        date_values = [
            datetime(2021, 1, 1),
            datetime(2021, 2, 1),
            pd.NaT,
            datetime(2021, 4, 1),
            datetime(2021, 5, 1),
        ]
        object_values = [
            np.random.random((10, 10)),
            np.array([3, 4]),
            np.array([5, 6]),
            np.array([7, 8]),
            np.array([9, 10]),
        ]
        period_values = [
            pd.Period(quarter)
            for quarter in ('2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1')
        ]
        timestamp_values = [
            pd.Timestamp(day)
            for day in ('2021-01-01', '2021-02-01', '2021-03-01',
                        '2021-04-01', '2021-05-01')
        ]
        interval_values = pd.arrays.IntervalArray.from_tuples(
            [(1, 2), (2, 3), (4, 5), (6, 7), (8, 9)]
        )
        sample = {
            'Integers': [1, 2, 3, 4, 5],
            'Floats': [1.1, 2.2, np.nan, 4.4, 5.5],
            'Strings': ['A', 'B', 'C', 'D', 'E'],
            'Booleans': [True, False, True, False, True],
            'DateTime': date_values,
            'Timedelta': pd.to_timedelta([1, 2, 3, 4, np.nan], unit='D'),
            'Categorical': pd.Categorical(['cat', 'dog', 'fish', 'bird', 'snake']),
            'Object': object_values,
            'Period': period_values,
            'Sparse': pd.Series(pd.arrays.SparseArray([0, 1, 0, 2, 0])),
            'Intervals': interval_values,
            'Complex': [1 + 2j, 2 + 3j, 3 + 4j, 4 + 5j, 5 + 6j],
            'Bytes': [b'A', b'B', b'C', b'D', b'E'],
            "Nulls": [None] * 5,
            'Timestamp': timestamp_values,
        }

        # Create DataFrame
        self.dataframe2 = pd.DataFrame(sample)

        # Zero-row frame with two columns and an empty object-dtype index.
        no_rows_index = pd.Index([], dtype=object)
        self.empty_dataframe = pd.DataFrame({"a": [], "b": []}, index=no_rows_index)

    def numpy(self):
        """Store an array of the integers 0..9,999,999 with dtype 'u8'."""
        import numpy as np
        self.np_array = np.arange(10000000, dtype='u8')

# Script entry point: instantiate the flow class only when this file is
# executed directly, not when it is imported.
# NOTE(review): presumably the FlowSpec constructor hands control to the
# Metaflow command-line interface — confirm against the Metaflow docs.
if __name__ == '__main__':
    DefaultCardFlow()

@savingoyal savingoyal merged commit 3b765a7 into Netflix:master Dec 8, 2023
21 of 22 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants