opsml.cards.data

  1# Copyright (c) Shipt, Inc.
  2# This source code is licensed under the MIT license found in the
  3# LICENSE file in the root directory of this source tree.
  4
  5# IMPORTANT: We need `Optional` imported here in order for Pydantic to be able to
  6# deserialize DataCard.
  7#
  8from typing import (  # noqa # pylint: disable=unused-import
  9    Any,
 10    Dict,
 11    List,
 12    Optional,
 13    Union,
 14)
 15
 16from pydantic import SerializeAsAny
 17
 18from opsml.cards.base import ArtifactCard
 19from opsml.data import Dataset
 20from opsml.data.interfaces._base import DataInterface
 21from opsml.data.splitter import Data, DataSplit
 22from opsml.helpers.logging import ArtifactLogger
 23from opsml.types import CardType, DataCardMetadata
 24
try:
    from ydata_profiling import ProfileReport
except ModuleNotFoundError:
    # ydata_profiling is an optional dependency. Fall back to `Any` so the
    # `ProfileReport` return annotation on DataCard.create_data_profile still
    # resolves when the package is not installed.
    ProfileReport = Any

# Module-level logger shared by this module.
logger = ArtifactLogger.get_logger()
 31
 32
 33class DataCard(ArtifactCard):
 34    """Create a DataCard from your data.
 35
 36    Args:
 37        interface:
 38            Instance of `DataInterface` that contains data
 39        name:
 40            What to name the data
 41        repository:
 42            Repository that this data is associated with
 43        contact:
 44            Contact to associate with data card
 45        info:
 46            `CardInfo` object containing additional metadata. If provided, it will override any
 47            values provided for `name`, `repository`, `contact`, and `version`.
 48
 49            Name, repository, and contact are required arguments for all cards. They can be provided
 50            directly or through a `CardInfo` object.
 51
 52        version:
 53            DataCard version
 54        uid:
 55            Unique id assigned to the DataCard
 56
 57    Returns:
 58        DataCard
 59
 60    """
 61
 62    interface: SerializeAsAny[Union[DataInterface, Dataset]]
 63    metadata: DataCardMetadata = DataCardMetadata()
 64
 65    def load_data(self, **kwargs: Union[str, int]) -> None:  # pylint: disable=differing-param-doc
 66        """
 67        Load data to interface
 68
 69        Args:
 70            kwargs:
 71                Keyword arguments to pass to the data loader
 72
 73            ---- Supported kwargs for ImageData and TextDataset ----
 74
 75            split:
 76                Split to use for data. If not provided, then all data will be loaded.
 77                Only used for subclasses of `Dataset`.
 78
 79            batch_size:
 80                What batch size to use when loading data. Only used for subclasses of `Dataset`.
 81                Defaults to 1000.
 82
 83            chunk_size:
 84                How many files per batch to use when writing arrow back to local file.
 85                Defaults to 1000.
 86
 87                Example:
 88
 89                    - If batch_size=1000 and chunk_size=100, then the loaded batch will be split into
 90                    10 chunks to write in parallel. This is useful for large datasets.
 91
 92        """
 93        from opsml.storage.card_loader import DataCardLoader
 94
 95        DataCardLoader(self).load_data(**kwargs)
 96
 97    def load_data_profile(self) -> None:
 98        """
 99        Load data to interface
100        """
101        from opsml.storage.card_loader import DataCardLoader
102
103        DataCardLoader(self).load_data_profile()
104
105    def create_registry_record(self) -> Dict[str, Any]:
106        """
107        Creates required metadata for registering the current data card.
108        Implemented with a DataRegistry object.
109            Returns:
110            Registry metadata
111        """
112        exclude_attr = {"data"}
113        return self.model_dump(exclude=exclude_attr)
114
115    def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
116        """
117        Adds metadata to the existing DataCard metadata dictionary
118
119        Args:
120            info:
121                Dictionary containing name (str) and value (float, int, str) pairs
122                to add to the current metadata set
123        """
124
125        self.metadata.additional_info = {**info, **self.metadata.additional_info}
126
127    def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
128        """Creates a data profile report
129
130        Args:
131            sample_perc:
132                Percentage of data to use when creating a profile. Sampling is recommended for large dataframes.
133                Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
134
135        """
136        assert isinstance(
137            self.interface, DataInterface
138        ), "Data profile can only be created for a DataInterface subclasses"
139        self.interface.create_data_profile(sample_perc, str(self.name))
140
141    def split_data(self) -> Dict[str, Data]:
142        """Splits data interface according to data split logic"""
143
144        assert isinstance(self.interface, DataInterface), "Splitting is only support for DataInterface subclasses"
145        if self.data is None:
146            self.load_data()
147
148        return self.interface.split_data()
149
150    @property
151    def data_splits(self) -> List[DataSplit]:
152        """Returns data splits"""
153        assert isinstance(self.interface, DataInterface), "Data splits are only supported for DataInterface subclasses"
154        return self.interface.data_splits
155
156    @property
157    def data(self) -> Any:
158        """Returns data"""
159        assert isinstance(
160            self.interface, DataInterface
161        ), "Data attribute is only supported for DataInterface subclasses"
162        return self.interface.data
163
164    @property
165    def data_profile(self) -> Any:
166        """Returns data profile"""
167        assert isinstance(self.interface, DataInterface), "Data profile is only supported for DataInterface subclasses"
168        return self.interface.data_profile
169
170    @property
171    def card_type(self) -> str:
172        return CardType.DATACARD.value
logger = <builtins.Logger object>
class DataCard(opsml.cards.base.ArtifactCard):
 34class DataCard(ArtifactCard):
 35    """Create a DataCard from your data.
 36
 37    Args:
 38        interface:
 39            Instance of `DataInterface` that contains data
 40        name:
 41            What to name the data
 42        repository:
 43            Repository that this data is associated with
 44        contact:
 45            Contact to associate with data card
 46        info:
 47            `CardInfo` object containing additional metadata. If provided, it will override any
 48            values provided for `name`, `repository`, `contact`, and `version`.
 49
 50            Name, repository, and contact are required arguments for all cards. They can be provided
 51            directly or through a `CardInfo` object.
 52
 53        version:
 54            DataCard version
 55        uid:
 56            Unique id assigned to the DataCard
 57
 58    Returns:
 59        DataCard
 60
 61    """
 62
 63    interface: SerializeAsAny[Union[DataInterface, Dataset]]
 64    metadata: DataCardMetadata = DataCardMetadata()
 65
 66    def load_data(self, **kwargs: Union[str, int]) -> None:  # pylint: disable=differing-param-doc
 67        """
 68        Load data to interface
 69
 70        Args:
 71            kwargs:
 72                Keyword arguments to pass to the data loader
 73
 74            ---- Supported kwargs for ImageData and TextDataset ----
 75
 76            split:
 77                Split to use for data. If not provided, then all data will be loaded.
 78                Only used for subclasses of `Dataset`.
 79
 80            batch_size:
 81                What batch size to use when loading data. Only used for subclasses of `Dataset`.
 82                Defaults to 1000.
 83
 84            chunk_size:
 85                How many files per batch to use when writing arrow back to local file.
 86                Defaults to 1000.
 87
 88                Example:
 89
 90                    - If batch_size=1000 and chunk_size=100, then the loaded batch will be split into
 91                    10 chunks to write in parallel. This is useful for large datasets.
 92
 93        """
 94        from opsml.storage.card_loader import DataCardLoader
 95
 96        DataCardLoader(self).load_data(**kwargs)
 97
 98    def load_data_profile(self) -> None:
 99        """
100        Load data to interface
101        """
102        from opsml.storage.card_loader import DataCardLoader
103
104        DataCardLoader(self).load_data_profile()
105
106    def create_registry_record(self) -> Dict[str, Any]:
107        """
108        Creates required metadata for registering the current data card.
109        Implemented with a DataRegistry object.
110            Returns:
111            Registry metadata
112        """
113        exclude_attr = {"data"}
114        return self.model_dump(exclude=exclude_attr)
115
116    def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
117        """
118        Adds metadata to the existing DataCard metadata dictionary
119
120        Args:
121            info:
122                Dictionary containing name (str) and value (float, int, str) pairs
123                to add to the current metadata set
124        """
125
126        self.metadata.additional_info = {**info, **self.metadata.additional_info}
127
128    def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
129        """Creates a data profile report
130
131        Args:
132            sample_perc:
133                Percentage of data to use when creating a profile. Sampling is recommended for large dataframes.
134                Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
135
136        """
137        assert isinstance(
138            self.interface, DataInterface
139        ), "Data profile can only be created for a DataInterface subclasses"
140        self.interface.create_data_profile(sample_perc, str(self.name))
141
142    def split_data(self) -> Dict[str, Data]:
143        """Splits data interface according to data split logic"""
144
145        assert isinstance(self.interface, DataInterface), "Splitting is only support for DataInterface subclasses"
146        if self.data is None:
147            self.load_data()
148
149        return self.interface.split_data()
150
151    @property
152    def data_splits(self) -> List[DataSplit]:
153        """Returns data splits"""
154        assert isinstance(self.interface, DataInterface), "Data splits are only supported for DataInterface subclasses"
155        return self.interface.data_splits
156
157    @property
158    def data(self) -> Any:
159        """Returns data"""
160        assert isinstance(
161            self.interface, DataInterface
162        ), "Data attribute is only supported for DataInterface subclasses"
163        return self.interface.data
164
165    @property
166    def data_profile(self) -> Any:
167        """Returns data profile"""
168        assert isinstance(self.interface, DataInterface), "Data profile is only supported for DataInterface subclasses"
169        return self.interface.data_profile
170
171    @property
172    def card_type(self) -> str:
173        return CardType.DATACARD.value

Create a DataCard from your data.

Arguments:
  • interface: Instance of DataInterface that contains data
  • name: What to name the data
  • repository: Repository that this data is associated with
  • contact: Contact to associate with data card
  • info: CardInfo object containing additional metadata. If provided, it will override any values provided for name, repository, contact, and version.

    Name, repository, and contact are required arguments for all cards. They can be provided directly or through a CardInfo object.

  • version: DataCard version
  • uid: Unique id assigned to the DataCard
Returns:

DataCard

interface: Annotated[Union[opsml.data.interfaces._base.DataInterface, opsml.data.interfaces.custom_data.base.Dataset], SerializeAsAny()]
metadata: opsml.types.data.DataCardMetadata
def load_data(self, **kwargs: Union[str, int]) -> None:
66    def load_data(self, **kwargs: Union[str, int]) -> None:  # pylint: disable=differing-param-doc
67        """
68        Load data to interface
69
70        Args:
71            kwargs:
72                Keyword arguments to pass to the data loader
73
74            ---- Supported kwargs for ImageData and TextDataset ----
75
76            split:
77                Split to use for data. If not provided, then all data will be loaded.
78                Only used for subclasses of `Dataset`.
79
80            batch_size:
81                What batch size to use when loading data. Only used for subclasses of `Dataset`.
82                Defaults to 1000.
83
84            chunk_size:
85                How many files per batch to use when writing arrow back to local file.
86                Defaults to 1000.
87
88                Example:
89
90                    - If batch_size=1000 and chunk_size=100, then the loaded batch will be split into
91                    10 chunks to write in parallel. This is useful for large datasets.
92
93        """
94        from opsml.storage.card_loader import DataCardLoader
95
96        DataCardLoader(self).load_data(**kwargs)

Load data to interface

Arguments:
  • kwargs: Keyword arguments to pass to the data loader
  • ---- Supported kwargs for ImageData and TextDataset ----
  • split: Split to use for data. If not provided, then all data will be loaded. Only used for subclasses of Dataset.
  • batch_size: What batch size to use when loading data. Only used for subclasses of Dataset. Defaults to 1000.
  • chunk_size: How many files per batch to use when writing arrow back to local file. Defaults to 1000.

    Example:

    • If batch_size=1000 and chunk_size=100, then the loaded batch will be split into 10 chunks to write in parallel. This is useful for large datasets.
def load_data_profile(self) -> None:
 98    def load_data_profile(self) -> None:
 99        """
100        Load data to interface
101        """
102        from opsml.storage.card_loader import DataCardLoader
103
104        DataCardLoader(self).load_data_profile()

Load the data profile into the interface

def create_registry_record(self) -> Dict[str, Any]:
106    def create_registry_record(self) -> Dict[str, Any]:
107        """
108        Creates required metadata for registering the current data card.
109        Implemented with a DataRegistry object.
110            Returns:
111            Registry metadata
112        """
113        exclude_attr = {"data"}
114        return self.model_dump(exclude=exclude_attr)

Creates the metadata required for registering the current data card (implemented with a DataRegistry object) and returns that registry metadata.

def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
116    def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
117        """
118        Adds metadata to the existing DataCard metadata dictionary
119
120        Args:
121            info:
122                Dictionary containing name (str) and value (float, int, str) pairs
123                to add to the current metadata set
124        """
125
126        self.metadata.additional_info = {**info, **self.metadata.additional_info}

Adds metadata to the existing DataCard metadata dictionary

Arguments:
  • info: Dictionary containing name (str) and value (float, int, str) pairs to add to the current metadata set
def create_data_profile( self, sample_perc: float = 1) -> ydata_profiling.profile_report.ProfileReport:
128    def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
129        """Creates a data profile report
130
131        Args:
132            sample_perc:
133                Percentage of data to use when creating a profile. Sampling is recommended for large dataframes.
134                Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
135
136        """
137        assert isinstance(
138            self.interface, DataInterface
139        ), "Data profile can only be created for a DataInterface subclasses"
140        self.interface.create_data_profile(sample_perc, str(self.name))

Creates a data profile report

Arguments:
  • sample_perc: Percentage of data to use when creating a profile. Sampling is recommended for large dataframes. Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
def split_data(self) -> Dict[str, opsml.data.splitter.Data]:
142    def split_data(self) -> Dict[str, Data]:
143        """Splits data interface according to data split logic"""
144
145        assert isinstance(self.interface, DataInterface), "Splitting is only support for DataInterface subclasses"
146        if self.data is None:
147            self.load_data()
148
149        return self.interface.split_data()

Splits data interface according to data split logic

data_splits: List[opsml.data.splitter.DataSplit]
151    @property
152    def data_splits(self) -> List[DataSplit]:
153        """Returns data splits"""
154        assert isinstance(self.interface, DataInterface), "Data splits are only supported for DataInterface subclasses"
155        return self.interface.data_splits

Returns data splits

data: Any
157    @property
158    def data(self) -> Any:
159        """Returns data"""
160        assert isinstance(
161            self.interface, DataInterface
162        ), "Data attribute is only supported for DataInterface subclasses"
163        return self.interface.data

Returns data

data_profile: Any
165    @property
166    def data_profile(self) -> Any:
167        """Returns data profile"""
168        assert isinstance(self.interface, DataInterface), "Data profile is only supported for DataInterface subclasses"
169        return self.interface.data_profile

Returns data profile

card_type: str
171    @property
172    def card_type(self) -> str:
173        return CardType.DATACARD.value
model_config = {'arbitrary_types_allowed': True, 'validate_assignment': False, 'validate_default': True}
model_fields = {'name': FieldInfo(annotation=str, required=False, default='undefined'), 'repository': FieldInfo(annotation=str, required=False, default='undefined'), 'contact': FieldInfo(annotation=str, required=False, default='undefined'), 'version': FieldInfo(annotation=str, required=False, default='0.0.0'), 'uid': FieldInfo(annotation=Union[str, NoneType], required=False), 'info': FieldInfo(annotation=Union[CardInfo, NoneType], required=False), 'tags': FieldInfo(annotation=Dict[str, str], required=False, default={}), 'interface': FieldInfo(annotation=Union[DataInterface, Dataset], required=True, metadata=[SerializeAsAny()]), 'metadata': FieldInfo(annotation=DataCardMetadata, required=False, default=DataCardMetadata(interface_type='', data_type='', description=Description(summary=None, sample_code=None, Notes=None), feature_map={}, additional_info={}, runcard_uid=None, pipelinecard_uid=None, auditcard_uid=None))}
model_computed_fields = {}
Inherited Members
pydantic.main.BaseModel
BaseModel
model_extra
model_fields_set
model_construct
model_copy
model_dump
model_dump_json
model_json_schema
model_parametrized_name
model_post_init
model_rebuild
model_validate
model_validate_json
model_validate_strings
dict
json
parse_obj
parse_raw
parse_file
from_orm
construct
copy
schema
schema_json
validate
update_forward_refs
opsml.cards.base.ArtifactCard
name
repository
contact
version
uid
info
tags
validate_args
add_tag
uri
artifact_uri