opsml.profile.profile_data
# pylint: disable=redefined-outer-name,import-outside-toplevel
# Copyright (c) Shipt, Inc.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
from typing import Any, List, Union

import pandas as pd
import polars as pl

DIR_PATH = os.path.dirname(__file__)
ProfileReport = Any


class DataProfiler:
    """Static helpers for building, loading and comparing `ydata-profiling` reports"""

    @staticmethod
    def create_profile_report(
        data: Union[pd.DataFrame, pl.DataFrame],
        name: str,
        sample_perc: float = 1,
    ) -> ProfileReport:
        """
        Builds a `ydata-profiling` report for a dataframe

        Args:
            data:
                Pandas or Polars dataframe. Polars frames are converted
                to pandas before profiling
            name:
                Name of the report; used in the report title
            sample_perc:
                Fraction of rows to profile. Values below 1 trigger
                sampling without replacement; otherwise all rows are used

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import ProfileReport

        report_title = f"Profile report for {name}"
        use_sample = sample_perc < 1

        # Resolve the pandas frame to profile first so the report is built in one place
        if isinstance(data, pl.DataFrame):
            if use_sample:
                frame = data.sample(fraction=sample_perc, with_replacement=False, shuffle=True).to_pandas()
            else:
                frame = data.to_pandas()
        elif use_sample:
            frame = data.sample(frac=sample_perc, replace=False)
        else:
            frame = data

        return ProfileReport(
            df=frame,
            config_file=os.path.join(DIR_PATH, "profile_config.yml"),
            lazy=False,
            title=report_title,
        )

    @staticmethod
    def load_profile(data: bytes) -> ProfileReport:
        """Rebuilds a `ProfileReport` from its serialized bytes

        Args:
            data:
                Serialized `ProfileReport` bytes

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import ProfileReport

        # Start from an empty report and populate it from the payload
        report = ProfileReport()
        report.loads(data)
        return report

    @staticmethod
    def compare_reports(reports: List[ProfileReport]) -> ProfileReport:
        """Builds a comparison report from several ProfileReports

        Args:
            reports:
                List of `ProfileReport`

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import compare

        comparison = compare(reports=reports)
        return comparison
DIR_PATH =
'/home/steven_forrester/github/opsml/opsml/profile'
ProfileReport =
typing.Any
class
DataProfiler:
class DataProfiler:
    """Static helpers for building, loading and comparing `ydata-profiling` reports"""

    @staticmethod
    def create_profile_report(
        data: Union[pd.DataFrame, pl.DataFrame],
        name: str,
        sample_perc: float = 1,
    ) -> ProfileReport:
        """
        Builds a `ydata-profiling` report for a dataframe

        Args:
            data:
                Pandas or Polars dataframe. Polars frames are converted
                to pandas before profiling
            name:
                Name of the report; used in the report title
            sample_perc:
                Fraction of rows to profile. Values below 1 trigger
                sampling without replacement; otherwise all rows are used

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import ProfileReport

        report_title = f"Profile report for {name}"
        use_sample = sample_perc < 1

        # Resolve the pandas frame to profile first so the report is built in one place
        if isinstance(data, pl.DataFrame):
            if use_sample:
                frame = data.sample(fraction=sample_perc, with_replacement=False, shuffle=True).to_pandas()
            else:
                frame = data.to_pandas()
        elif use_sample:
            frame = data.sample(frac=sample_perc, replace=False)
        else:
            frame = data

        return ProfileReport(
            df=frame,
            config_file=os.path.join(DIR_PATH, "profile_config.yml"),
            lazy=False,
            title=report_title,
        )

    @staticmethod
    def load_profile(data: bytes) -> ProfileReport:
        """Rebuilds a `ProfileReport` from its serialized bytes

        Args:
            data:
                Serialized `ProfileReport` bytes

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import ProfileReport

        # Start from an empty report and populate it from the payload
        report = ProfileReport()
        report.loads(data)
        return report

    @staticmethod
    def compare_reports(reports: List[ProfileReport]) -> ProfileReport:
        """Builds a comparison report from several ProfileReports

        Args:
            reports:
                List of `ProfileReport`

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import compare

        comparison = compare(reports=reports)
        return comparison
@staticmethod
def
create_profile_report( data: Union[pandas.core.frame.DataFrame, polars.dataframe.frame.DataFrame], name: str, sample_perc: float = 1) -> Any:
@staticmethod
def create_profile_report(
    data: Union[pd.DataFrame, pl.DataFrame],
    name: str,
    sample_perc: float = 1,
) -> ProfileReport:
    """
    Builds a `ydata-profiling` report for a dataframe

    Args:
        data:
            Pandas or Polars dataframe. Polars frames are converted
            to pandas before profiling
        name:
            Name of the report; used in the report title
        sample_perc:
            Fraction of rows to profile. Values below 1 trigger
            sampling without replacement; otherwise all rows are used

    Returns:
        `ProfileReport`
    """
    from ydata_profiling import ProfileReport

    report_title = f"Profile report for {name}"
    use_sample = sample_perc < 1

    # Resolve the pandas frame to profile first so the report is built in one place
    if isinstance(data, pl.DataFrame):
        if use_sample:
            frame = data.sample(fraction=sample_perc, with_replacement=False, shuffle=True).to_pandas()
        else:
            frame = data.to_pandas()
    elif use_sample:
        frame = data.sample(frac=sample_perc, replace=False)
    else:
        frame = data

    return ProfileReport(
        df=frame,
        config_file=os.path.join(DIR_PATH, "profile_config.yml"),
        lazy=False,
        title=report_title,
    )
Creates a ydata-profiling
report
Arguments:
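- data: Pandas or Polars dataframe (Polars input is converted to pandas before profiling)
- sample_perc: Fraction of rows to sample; values below 1 enable sampling without replacement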
- name: Name of the report
Returns: `ProfileReport`
@staticmethod
def
load_profile(data: bytes) -> Any:
@staticmethod
def load_profile(data: bytes) -> ProfileReport:
    """Rebuilds a `ProfileReport` from its serialized bytes

    Args:
        data:
            Serialized `ProfileReport` bytes

    Returns:
        `ProfileReport`
    """
    from ydata_profiling import ProfileReport

    # Start from an empty report and populate it from the payload.
    # NOTE(review): presumably `loads` unpickles the payload; confirm, and
    # only pass bytes that come from a trusted source.
    report = ProfileReport()
    report.loads(data)
    return report
@staticmethod
def
compare_reports(reports: List[Any]) -> Any: