opsml.cards.data
# Copyright (c) Shipt, Inc.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# IMPORTANT: We need `Optional` imported here in order for Pydantic to be able to
# deserialize DataCard.
#
from typing import (  # noqa # pylint: disable=unused-import
    Any,
    Dict,
    List,
    Optional,
    Union,
)

from pydantic import SerializeAsAny

from opsml.cards.base import ArtifactCard
from opsml.data import Dataset
from opsml.data.interfaces._base import DataInterface
from opsml.data.splitter import Data, DataSplit
from opsml.helpers.logging import ArtifactLogger
from opsml.types import CardType, DataCardMetadata

try:
    from ydata_profiling import ProfileReport
except ModuleNotFoundError:
    ProfileReport = Any

logger = ArtifactLogger.get_logger()


class DataCard(ArtifactCard):
    """Create a DataCard from your data.

    Args:
        interface:
            Instance of `DataInterface` that contains data
        name:
            What to name the data
        repository:
            Repository that this data is associated with
        contact:
            Contact to associate with data card
        info:
            `CardInfo` object containing additional metadata. If provided, it will override any
            values provided for `name`, `repository`, `contact`, and `version`.

            Name, repository, and contact are required arguments for all cards. They can be
            provided directly or through a `CardInfo` object.

        version:
            DataCard version
        uid:
            Unique id assigned to the DataCard

    Returns:
        DataCard

    """

    interface: SerializeAsAny[Union[DataInterface, Dataset]]
    metadata: DataCardMetadata = DataCardMetadata()

    def load_data(self, **kwargs: Union[str, int]) -> None:  # pylint: disable=differing-param-doc
        """
        Load data to interface

        Args:
            kwargs:
                Keyword arguments to pass to the data loader

            ---- Supported kwargs for ImageData and TextDataset ----

            split:
                Split to use for data. If not provided, then all data will be loaded.
                Only used for subclasses of `Dataset`.

            batch_size:
                What batch size to use when loading data. Only used for subclasses of `Dataset`.
                Defaults to 1000.

            chunk_size:
                How many files per batch to use when writing arrow back to local file.
                Defaults to 1000.

            Example:

                - If batch_size=1000 and chunk_size=100, then the loaded batch will be split into
                  10 chunks to write in parallel. This is useful for large datasets.

        """
        from opsml.storage.card_loader import DataCardLoader

        DataCardLoader(self).load_data(**kwargs)

    def load_data_profile(self) -> None:
        """
        Load data profile to interface
        """
        from opsml.storage.card_loader import DataCardLoader

        DataCardLoader(self).load_data_profile()

    def create_registry_record(self) -> Dict[str, Any]:
        """
        Creates required metadata for registering the current data card.
        Implemented with a DataRegistry object.

        Returns:
            Registry metadata
        """
        exclude_attr = {"data"}
        return self.model_dump(exclude=exclude_attr)

    def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
        """
        Adds metadata to the existing DataCard metadata dictionary

        Args:
            info:
                Dictionary containing name (str) and value (float, int, str) pairs
                to add to the current metadata set
        """

        self.metadata.additional_info = {**info, **self.metadata.additional_info}

    def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
        """Creates a data profile report

        Args:
            sample_perc:
                Percentage of data to use when creating a profile. Sampling is recommended for large dataframes.
                Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)

        """
        assert isinstance(
            self.interface, DataInterface
        ), "Data profile can only be created for DataInterface subclasses"
        self.interface.create_data_profile(sample_perc, str(self.name))

    def split_data(self) -> Dict[str, Data]:
        """Splits data interface according to data split logic"""

        assert isinstance(self.interface, DataInterface), "Splitting is only supported for DataInterface subclasses"
        if self.data is None:
            self.load_data()

        return self.interface.split_data()

    @property
    def data_splits(self) -> List[DataSplit]:
        """Returns data splits"""
        assert isinstance(self.interface, DataInterface), "Data splits are only supported for DataInterface subclasses"
        return self.interface.data_splits

    @property
    def data(self) -> Any:
        """Returns data"""
        assert isinstance(
            self.interface, DataInterface
        ), "Data attribute is only supported for DataInterface subclasses"
        return self.interface.data

    @property
    def data_profile(self) -> Any:
        """Returns data profile"""
        assert isinstance(self.interface, DataInterface), "Data profile is only supported for DataInterface subclasses"
        return self.interface.data_profile

    @property
    def card_type(self) -> str:
        return CardType.DATACARD.value
class DataCard(ArtifactCard):
Create a DataCard from your data.

Arguments:
- interface: Instance of `DataInterface` that contains data
- name: What to name the data
- repository: Repository that this data is associated with
- contact: Contact to associate with data card
- info: `CardInfo` object containing additional metadata. If provided, it will override any
  values provided for `name`, `repository`, `contact`, and `version`. Name, repository, and
  contact are required arguments for all cards. They can be provided directly or through a
  `CardInfo` object.
- version: DataCard version
- uid: Unique id assigned to the DataCard
Returns:
DataCard
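For illustration, a minimal construction sketch. It assumes a pandas DataFrame wrapped in a `PandasData` interface; the interface name, its import path, and the field values below are assumptions rather than part of the source above, and any `DataInterface` or `Dataset` subclass works in its place.

import pandas as pd

from opsml.cards.data import DataCard
from opsml.data import PandasData  # assumed DataInterface subclass and import path

# Wrap the raw data in an interface, then attach the interface to the card.
df = pd.DataFrame({"feature": [1, 2, 3], "target": [0, 1, 0]})

card = DataCard(
    interface=PandasData(data=df),
    name="example-dataset",
    repository="example-team",
    contact="owner@example.com",
)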
def load_data(self, **kwargs: Union[str, int]) -> None:
Load data to interface
Arguments:
- kwargs: Keyword arguments to pass to the data loader
---- Supported kwargs for ImageData and TextDataset ----

- split: Split to use for data. If not provided, then all data will be loaded.
  Only used for subclasses of `Dataset`.
- batch_size: What batch size to use when loading data. Only used for subclasses of `Dataset`.
  Defaults to 1000.
- chunk_size: How many files per batch to use when writing arrow back to local file.
  Defaults to 1000.
Example:
- If batch_size=1000 and chunk_size=100, then the loaded batch will be split into 10 chunks to write in parallel. This is useful for large datasets.
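A hedged sketch of the call, assuming the card built earlier; the split name is illustrative, and the size kwargs only apply to `Dataset` subclasses.

# Load all data registered on the interface.
card.load_data()

# For Dataset subclasses (e.g. image/text datasets), narrow and tune the load.
# "train" is an assumed split name; per the docstring, batch_size=1000 with
# chunk_size=100 writes each loaded batch as 10 chunks in parallel.
card.load_data(split="train", batch_size=1000, chunk_size=100)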
def load_data_profile(self) -> None:
Load data profile to interface
def create_registry_record(self) -> Dict[str, Any]:
Creates required metadata for registering the current data card. Implemented with a DataRegistry object.

Returns:
Registry metadata
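A short sketch of what registration consumes, assuming the card built earlier; per the source, the record is the card's serialized fields with the raw data excluded.

# Dictionary of card metadata used by the registry; the data itself is not included.
record = card.create_registry_record()
print(record.keys())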
def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
Adds metadata to the existing DataCard metadata dictionary
Arguments:
- info: Dictionary containing name (str) and value (float, int, str) pairs to add to the current metadata set
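A small sketch, assuming the card built earlier. Note that, per the merge order in the source (`{**info, **self.metadata.additional_info}`), keys already present in the existing metadata take precedence over new values.

# Attach lightweight key/value metadata to the card; values may be float, int, or str.
card.add_info({"rows": 3, "source": "example-warehouse"})
print(card.metadata.additional_info)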
def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
Creates a data profile report
Arguments:
- sample_perc: Percentage of data to use when creating a profile. Sampling is recommended for large dataframes. Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
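A hedged sketch, assuming `ydata_profiling` is installed and the card's interface is a `DataInterface` subclass (the method asserts this).

# Profile half the data; sampling keeps profiling tractable for large dataframes.
card.create_data_profile(sample_perc=0.5)

# The generated report is then available on the card.
profile = card.data_profile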
def split_data(self) -> Dict[str, Data]:
Splits data interface according to data split logic
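A hedged consumption sketch, assuming splits were configured on the interface via `DataSplit` objects; the dictionary keys are taken to be the split labels, which is an assumption not spelled out in the source above.

# Requires a DataInterface subclass; data is loaded lazily if not already in memory.
splits = card.split_data()

for label, data in splits.items():
    print(label, type(data))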
@property
def data_splits(self) -> List[DataSplit]:
Returns data splits
@property
def data(self) -> Any:
Returns data
@property
def data_profile(self) -> Any:
Returns data profile