opsml.cards.data

  1# Copyright (c) Shipt, Inc.
  2# This source code is licensed under the MIT license found in the
  3# LICENSE file in the root directory of this source tree.
  4
  5# IMPORTANT: We need `Optional` imported here in order for Pydantic to be able to
  6# deserialize DataCard.
  7#
  8from typing import (  # noqa # pylint: disable=unused-import
  9    Any,
 10    Dict,
 11    List,
 12    Optional,
 13    Union,
 14)
 15
 16from pydantic import SerializeAsAny
 17
 18from opsml.cards.base import ArtifactCard
 19from opsml.data import Dataset
 20from opsml.data.interfaces._base import DataInterface
 21from opsml.data.splitter import Data, DataSplit
 22from opsml.helpers.logging import ArtifactLogger
 23from opsml.types import CardType, DataCardMetadata
 24
try:
    from ydata_profiling import ProfileReport
except ModuleNotFoundError:
    # ydata_profiling is an optional dependency. Fall back to `Any` so the
    # `ProfileReport` return annotation on DataCard.create_data_profile still
    # resolves when the package is not installed.
    ProfileReport = Any

# Module-level logger shared by this module.
logger = ArtifactLogger.get_logger()
 31
 32
 33class DataCard(ArtifactCard):
 34    """Create a DataCard from your data.
 35
 36    Args:
 37        interface:
 38            Instance of `DataInterface` that contains data
 39        name:
 40            What to name the data
 41        repository:
 42            Repository that this data is associated with
 43        contact:
 44            Contact to associate with data card
 45        info:
 46            `CardInfo` object containing additional metadata. If provided, it will override any
 47            values provided for `name`, `repository`, `contact`, and `version`.
 48
 49            Name, repository, and contact are required arguments for all cards. They can be provided
 50            directly or through a `CardInfo` object.
 51
 52        version:
 53            DataCard version
 54        uid:
 55            Unique id assigned to the DataCard
 56
 57    Returns:
 58        DataCard
 59
 60    """
 61
 62    interface: SerializeAsAny[Union[DataInterface, Dataset]]
 63    metadata: DataCardMetadata = DataCardMetadata()
 64
 65    def load_data(self, **kwargs: Union[str, int]) -> None:  # pylint: disable=differing-param-doc
 66        """
 67        Load data to interface
 68
 69        Args:
 70            kwargs:
 71                Keyword arguments to pass to the data loader
 72
 73            ---- Supported kwargs for ImageData and TextDataset ----
 74
 75            split:
 76                Split to use for data. If not provided, then all data will be loaded.
 77                Only used for subclasses of `Dataset`.
 78
 79            batch_size:
 80                What batch size to use when loading data. Only used for subclasses of `Dataset`.
 81                Defaults to 1000.
 82
 83            chunk_size:
 84                How many files per batch to use when writing arrow back to local file.
 85                Defaults to 1000.
 86
 87                Example:
 88
 89                    - If batch_size=1000 and chunk_size=100, then the loaded batch will be split into
 90                    10 chunks to write in parallel. This is useful for large datasets.
 91
 92        """
 93        from opsml.storage.card_loader import DataCardLoader
 94
 95        DataCardLoader(self).load_data(**kwargs)
 96
 97    def load_data_profile(self) -> None:
 98        """
 99        Load data to interface
100        """
101        from opsml.storage.card_loader import DataCardLoader
102
103        DataCardLoader(self).load_data_profile()
104
105    def create_registry_record(self) -> Dict[str, Any]:
106        """
107        Creates required metadata for registering the current data card.
108        Implemented with a DataRegistry object.
109            Returns:
110            Registry metadata
111        """
112        exclude_attr = {"data"}
113        return self.model_dump(exclude=exclude_attr)
114
115    def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
116        """
117        Adds metadata to the existing DataCard metadata dictionary
118
119        Args:
120            info:
121                Dictionary containing name (str) and value (float, int, str) pairs
122                to add to the current metadata set
123        """
124
125        self.metadata.additional_info = {**info, **self.metadata.additional_info}
126
127    def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
128        """Creates a data profile report
129
130        Args:
131            sample_perc:
132                Percentage of data to use when creating a profile. Sampling is recommended for large dataframes.
133                Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
134
135        """
136        assert isinstance(
137            self.interface, DataInterface
138        ), "Data profile can only be created for a DataInterface subclasses"
139        self.interface.create_data_profile(sample_perc, str(self.name))
140
141    def split_data(self) -> Dict[str, Data]:
142        """Splits data interface according to data split logic"""
143
144        assert isinstance(self.interface, DataInterface), "Splitting is only support for DataInterface subclasses"
145        if self.data is None:
146            self.load_data()
147
148        return self.interface.split_data()
149
150    @property
151    def data_splits(self) -> List[DataSplit]:
152        """Returns data splits"""
153        assert isinstance(self.interface, DataInterface), "Data splits are only supported for DataInterface subclasses"
154        return self.interface.data_splits
155
156    @property
157    def data(self) -> Any:
158        """Returns data"""
159        assert isinstance(
160            self.interface, DataInterface
161        ), "Data attribute is only supported for DataInterface subclasses"
162        return self.interface.data
163
164    @property
165    def data_profile(self) -> Any:
166        """Returns data profile"""
167        assert isinstance(self.interface, DataInterface), "Data profile is only supported for DataInterface subclasses"
168        return self.interface.data_profile
169
170    @property
171    def card_type(self) -> str:
172        return CardType.DATACARD.value
logger = <builtins.Logger object>
class DataCard(opsml.cards.base.ArtifactCard):
 34class DataCard(ArtifactCard):
 35    """Create a DataCard from your data.
 36
 37    Args:
 38        interface:
 39            Instance of `DataInterface` that contains data
 40        name:
 41            What to name the data
 42        repository:
 43            Repository that this data is associated with
 44        contact:
 45            Contact to associate with data card
 46        info:
 47            `CardInfo` object containing additional metadata. If provided, it will override any
 48            values provided for `name`, `repository`, `contact`, and `version`.
 49
 50            Name, repository, and contact are required arguments for all cards. They can be provided
 51            directly or through a `CardInfo` object.
 52
 53        version:
 54            DataCard version
 55        uid:
 56            Unique id assigned to the DataCard
 57
 58    Returns:
 59        DataCard
 60
 61    """
 62
 63    interface: SerializeAsAny[Union[DataInterface, Dataset]]
 64    metadata: DataCardMetadata = DataCardMetadata()
 65
 66    def load_data(self, **kwargs: Union[str, int]) -> None:  # pylint: disable=differing-param-doc
 67        """
 68        Load data to interface
 69
 70        Args:
 71            kwargs:
 72                Keyword arguments to pass to the data loader
 73
 74            ---- Supported kwargs for ImageData and TextDataset ----
 75
 76            split:
 77                Split to use for data. If not provided, then all data will be loaded.
 78                Only used for subclasses of `Dataset`.
 79
 80            batch_size:
 81                What batch size to use when loading data. Only used for subclasses of `Dataset`.
 82                Defaults to 1000.
 83
 84            chunk_size:
 85                How many files per batch to use when writing arrow back to local file.
 86                Defaults to 1000.
 87
 88                Example:
 89
 90                    - If batch_size=1000 and chunk_size=100, then the loaded batch will be split into
 91                    10 chunks to write in parallel. This is useful for large datasets.
 92
 93        """
 94        from opsml.storage.card_loader import DataCardLoader
 95
 96        DataCardLoader(self).load_data(**kwargs)
 97
 98    def load_data_profile(self) -> None:
 99        """
100        Load data to interface
101        """
102        from opsml.storage.card_loader import DataCardLoader
103
104        DataCardLoader(self).load_data_profile()
105
106    def create_registry_record(self) -> Dict[str, Any]:
107        """
108        Creates required metadata for registering the current data card.
109        Implemented with a DataRegistry object.
110            Returns:
111            Registry metadata
112        """
113        exclude_attr = {"data"}
114        return self.model_dump(exclude=exclude_attr)
115
116    def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
117        """
118        Adds metadata to the existing DataCard metadata dictionary
119
120        Args:
121            info:
122                Dictionary containing name (str) and value (float, int, str) pairs
123                to add to the current metadata set
124        """
125
126        self.metadata.additional_info = {**info, **self.metadata.additional_info}
127
128    def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
129        """Creates a data profile report
130
131        Args:
132            sample_perc:
133                Percentage of data to use when creating a profile. Sampling is recommended for large dataframes.
134                Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
135
136        """
137        assert isinstance(
138            self.interface, DataInterface
139        ), "Data profile can only be created for a DataInterface subclasses"
140        self.interface.create_data_profile(sample_perc, str(self.name))
141
142    def split_data(self) -> Dict[str, Data]:
143        """Splits data interface according to data split logic"""
144
145        assert isinstance(self.interface, DataInterface), "Splitting is only support for DataInterface subclasses"
146        if self.data is None:
147            self.load_data()
148
149        return self.interface.split_data()
150
151    @property
152    def data_splits(self) -> List[DataSplit]:
153        """Returns data splits"""
154        assert isinstance(self.interface, DataInterface), "Data splits are only supported for DataInterface subclasses"
155        return self.interface.data_splits
156
157    @property
158    def data(self) -> Any:
159        """Returns data"""
160        assert isinstance(
161            self.interface, DataInterface
162        ), "Data attribute is only supported for DataInterface subclasses"
163        return self.interface.data
164
165    @property
166    def data_profile(self) -> Any:
167        """Returns data profile"""
168        assert isinstance(self.interface, DataInterface), "Data profile is only supported for DataInterface subclasses"
169        return self.interface.data_profile
170
171    @property
172    def card_type(self) -> str:
173        return CardType.DATACARD.value

Create a DataCard from your data.

Arguments:
  • interface: Instance of DataInterface that contains data
  • name: What to name the data
  • repository: Repository that this data is associated with
  • contact: Contact to associate with data card
  • info: CardInfo object containing additional metadata. If provided, it will override any values provided for name, repository, contact, and version.

    Name, repository, and contact are required arguments for all cards. They can be provided directly or through a CardInfo object.

  • version: DataCard version
  • uid: Unique id assigned to the DataCard
Returns:

DataCard

interface: Annotated[Union[opsml.data.interfaces._base.DataInterface, opsml.data.interfaces.custom_data.base.Dataset], SerializeAsAny()]
metadata: opsml.types.data.DataCardMetadata
def load_data(self, **kwargs: Union[str, int]) -> None:
66    def load_data(self, **kwargs: Union[str, int]) -> None:  # pylint: disable=differing-param-doc
67        """
68        Load data to interface
69
70        Args:
71            kwargs:
72                Keyword arguments to pass to the data loader
73
74            ---- Supported kwargs for ImageData and TextDataset ----
75
76            split:
77                Split to use for data. If not provided, then all data will be loaded.
78                Only used for subclasses of `Dataset`.
79
80            batch_size:
81                What batch size to use when loading data. Only used for subclasses of `Dataset`.
82                Defaults to 1000.
83
84            chunk_size:
85                How many files per batch to use when writing arrow back to local file.
86                Defaults to 1000.
87
88                Example:
89
90                    - If batch_size=1000 and chunk_size=100, then the loaded batch will be split into
91                    10 chunks to write in parallel. This is useful for large datasets.
92
93        """
94        from opsml.storage.card_loader import DataCardLoader
95
96        DataCardLoader(self).load_data(**kwargs)

Load data to interface

Arguments:
  • kwargs: Keyword arguments to pass to the data loader
  • ---- Supported kwargs for ImageData and TextDataset ----
  • split: Split to use for data. If not provided, then all data will be loaded. Only used for subclasses of Dataset.
  • batch_size: What batch size to use when loading data. Only used for subclasses of Dataset. Defaults to 1000.
  • chunk_size: How many files per batch to use when writing arrow back to local file. Defaults to 1000.

    Example:

    • If batch_size=1000 and chunk_size=100, then the loaded batch will be split into 10 chunks to write in parallel. This is useful for large datasets.
def load_data_profile(self) -> None:
 98    def load_data_profile(self) -> None:
 99        """
100        Load data to interface
101        """
102        from opsml.storage.card_loader import DataCardLoader
103
104        DataCardLoader(self).load_data_profile()

Load the data profile into the interface

def create_registry_record(self) -> Dict[str, Any]:
106    def create_registry_record(self) -> Dict[str, Any]:
107        """
108        Creates required metadata for registering the current data card.
109        Implemented with a DataRegistry object.
110            Returns:
111            Registry metadata
112        """
113        exclude_attr = {"data"}
114        return self.model_dump(exclude=exclude_attr)

Creates the metadata required for registering the current data card (implemented with a DataRegistry object) and returns that registry metadata.

def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
116    def add_info(self, info: Dict[str, Union[float, int, str]]) -> None:
117        """
118        Adds metadata to the existing DataCard metadata dictionary
119
120        Args:
121            info:
122                Dictionary containing name (str) and value (float, int, str) pairs
123                to add to the current metadata set
124        """
125
126        self.metadata.additional_info = {**info, **self.metadata.additional_info}

Adds metadata to the existing DataCard metadata dictionary

Arguments:
  • info: Dictionary containing name (str) and value (float, int, str) pairs to add to the current metadata set
def create_data_profile( self, sample_perc: float = 1) -> ydata_profiling.profile_report.ProfileReport:
128    def create_data_profile(self, sample_perc: float = 1) -> ProfileReport:
129        """Creates a data profile report
130
131        Args:
132            sample_perc:
133                Percentage of data to use when creating a profile. Sampling is recommended for large dataframes.
134                Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
135
136        """
137        assert isinstance(
138            self.interface, DataInterface
139        ), "Data profile can only be created for a DataInterface subclasses"
140        self.interface.create_data_profile(sample_perc, str(self.name))

Creates a data profile report

Arguments:
  • sample_perc: Percentage of data to use when creating a profile. Sampling is recommended for large dataframes. Percentage is expressed as a decimal (e.g. 1 = 100%, 0.5 = 50%, etc.)
def split_data(self) -> Dict[str, opsml.data.splitter.Data]:
142    def split_data(self) -> Dict[str, Data]:
143        """Splits data interface according to data split logic"""
144
145        assert isinstance(self.interface, DataInterface), "Splitting is only support for DataInterface subclasses"
146        if self.data is None:
147            self.load_data()
148
149        return self.interface.split_data()

Splits data interface according to data split logic

data_splits: List[opsml.data.splitter.DataSplit]
151    @property
152    def data_splits(self) -> List[DataSplit]:
153        """Returns data splits"""
154        assert isinstance(self.interface, DataInterface), "Data splits are only supported for DataInterface subclasses"
155        return self.interface.data_splits

Returns data splits

data: Any
157    @property
158    def data(self) -> Any:
159        """Returns data"""
160        assert isinstance(
161            self.interface, DataInterface
162        ), "Data attribute is only supported for DataInterface subclasses"
163        return self.interface.data

Returns data

data_profile: Any
165    @property
166    def data_profile(self) -> Any:
167        """Returns data profile"""
168        assert isinstance(self.interface, DataInterface), "Data profile is only supported for DataInterface subclasses"
169        return self.interface.data_profile

Returns data profile

card_type: str
171    @property
172    def card_type(self) -> str:
173        return CardType.DATACARD.value
model_config = {'arbitrary_types_allowed': True, 'validate_assignment': False, 'validate_default': True}
model_fields = {'name': FieldInfo(annotation=str, required=False, default='undefined'), 'repository': FieldInfo(annotation=str, required=False, default='undefined'), 'contact': FieldInfo(annotation=str, required=False, default='undefined'), 'version': FieldInfo(annotation=str, required=False, default='0.0.0'), 'uid': FieldInfo(annotation=Union[str, NoneType], required=False), 'info': FieldInfo(annotation=Union[CardInfo, NoneType], required=False), 'tags': FieldInfo(annotation=Dict[str, str], required=False, default={}), 'interface': FieldInfo(annotation=Union[DataInterface, Dataset], required=True, metadata=[SerializeAsAny()]), 'metadata': FieldInfo(annotation=DataCardMetadata, required=False, default=DataCardMetadata(interface_type='', data_type='', description=Description(summary=None, sample_code=None, Notes=None), feature_map={}, additional_info={}, runcard_uid=None, pipelinecard_uid=None, auditcard_uid=None))}
model_computed_fields = {}
Inherited Members
pydantic.main.BaseModel
BaseModel
model_extra
model_fields_set
model_construct
model_copy
model_dump
model_dump_json
model_json_schema
model_parametrized_name
model_post_init
model_rebuild
model_validate
model_validate_json
model_validate_strings
dict
json
parse_obj
parse_raw
parse_file
from_orm
construct
copy
schema
schema_json
validate
update_forward_refs
opsml.cards.base.ArtifactCard
name
repository
contact
version
uid
info
tags
validate_args
add_tag
uri
artifact_uri