Skip to content

Data Profile

Data Profile

Opsml DataInterfaces support ydata-profiling through an optional extra, which can be installed with:

poetry add opsml[profiling]

To add a data profile to your interface, you can either supply a custom data profile created with the ydata-profiling library or call the create_data_profile method after DataInterface instantiation. Note - you can also call create_data_profile from a DataCard after instantiation (example below). The create_data_profile method is optimized for performance and will therefore omit certain analyses by default (interactions, character/word analysis, etc.). If you'd like more control over which analyses are run, it is recommended that you create a custom report via ydata-profiling and provide it to the DataCard using the data_profile arg.

Example of create_data_profile

# Data
from sklearn.datasets import load_linnerud

# Opsml
from opsml import CardInfo, DataCard, CardRegistry, PandasData

# Load the Linnerud dataset as DataFrames and add the Pulse target as a column
data, target = load_linnerud(return_X_y=True, as_frame=True)
data["Pulse"] = target.Pulse

interface = PandasData(data=data)

# Create a data profile from the interface
interface.create_data_profile(sample_perc=0.5)  # you can specify a sampling percentage between 0 and 1

card_info = CardInfo(name="linnerrud", repository="opsml", contact="user@email.com")
data_card = DataCard(info=card_info, data=data)

# This also works: create the profile from the DataCard instead
data_card.create_data_profile(sample_perc=0.5) 

# If you'd like to view your report, you can export it to html or json.
# Jupyter notebooks will render the html without needing to save (just call data_card.data_profile)
# data_card.data_profile.to_file("my_report.html")

# Registering the card will automatically save the report and its html
data_registry = CardRegistry(registry_name="data")
data_registry.register_card(card=data_card)

Example of providing your own custom data profile

# Data
from sklearn.datasets import load_linnerud  # needed so this snippet runs on its own

from ydata_profiling import ProfileReport

# Opsml
from opsml import PandasData

# Load the Linnerud dataset as DataFrames and add the Pulse target as a column
data, target = load_linnerud(return_X_y=True, as_frame=True)
data["Pulse"] = target.Pulse

# Build a custom report directly with ydata-profiling and hand it to the interface
data_profile = ProfileReport(data, title="Profiling Report")
interface = PandasData(data=data, data_profile=data_profile)

Comparing data profiles

You can also leverage Opsml's thin profiling wrapper to compare different data profiles.

from sklearn.datasets import load_linnerud
import numpy as np

# Opsml
from opsml import PandasData
from opsml.profile import DataProfiler

# Load the Linnerud dataset as DataFrames and add the Pulse target as a column
data, target = load_linnerud(return_X_y=True, as_frame=True)
data["Pulse"] = target.Pulse

# Simulate the data behind a 1st DataCard
interface = PandasData(data=data)
interface.create_data_profile()

# Simulate the data behind a 2nd DataCard: rescale every column by a random factor
# so the two profiles actually differ
data2 = data * np.random.rand(data.shape[1])
interface2 = PandasData(data=data2)
interface2.create_data_profile()

# Compare the profiles held by the two interfaces and export the result
comparison = DataProfiler.compare_reports(reports=[interface.data_profile, interface2.data_profile])
comparison.to_file("comparison_report.html")

Docs

opsml.profile.DataProfiler

Source code in opsml/profile/profile_data.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class DataProfiler:
    """Static helpers for creating, restoring and comparing `ydata-profiling` reports."""

    @staticmethod
    def create_profile_report(
        data: Union[pd.DataFrame, pl.DataFrame],
        name: str,
        sample_perc: float = 1,
    ) -> ProfileReport:
        """
        Builds a `ydata-profiling` report for a dataframe

        Polars dataframes are converted to pandas before profiling. When
        `sample_perc` is below 1 the data is sampled without replacement first;
        any value of 1 or more profiles the data as given.

        Args:
            data:
                Pandas or Polars dataframe to profile
            name:
                Name of the report. Used in the report title
            sample_perc:
                Fraction of rows to sample before profiling

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import ProfileReport

        # Resolve the (optionally sampled) pandas frame first so the report
        # itself is constructed in exactly one place.
        frame = data
        if isinstance(frame, pl.DataFrame):
            if sample_perc < 1:
                frame = frame.sample(fraction=sample_perc, with_replacement=False, shuffle=True)
            frame = frame.to_pandas()
        elif sample_perc < 1:
            frame = frame.sample(frac=sample_perc, replace=False)

        config_path = os.path.join(DIR_PATH, "profile_config.yml")
        return ProfileReport(
            df=frame,
            config_file=config_path,
            lazy=False,
            title=f"Profile report for {name}",
        )

    @staticmethod
    def load_profile(data: bytes) -> ProfileReport:
        """Restores a `ProfileReport` from its serialized bytes

        An empty `ProfileReport` is created and `loads` is called on it with
        the supplied bytes.

        Args:
            data:
                Serialized `ProfileReport` as bytes

        Returns:
            `ProfileReport`
        """
        from ydata_profiling import ProfileReport

        restored = ProfileReport()
        restored.loads(data)
        return restored

    @staticmethod
    def compare_reports(reports: List[ProfileReport]) -> ProfileReport:
        """Builds a comparison report from several `ProfileReport` objects

        Args:
            reports:
                List of `ProfileReport` to compare

        Returns:
            `ProfileReport` produced by `ydata_profiling.compare`
        """
        from ydata_profiling import compare

        comparison = compare(reports=reports)
        return comparison

create_profile_report(data, name, sample_perc=1) staticmethod

Creates a ydata-profiling report

Parameters:

Name Type Description Default
data Union[pd.DataFrame, pl.DataFrame]

Pandas or Polars dataframe

required
sample_perc float

Percentage to use for sampling

1
name str

Name of the report

required

Returns:

Type Description
ProfileReport

ProfileReport

Source code in opsml/profile/profile_data.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
@staticmethod
def create_profile_report(
    data: Union[pd.DataFrame, pl.DataFrame],
    name: str,
    sample_perc: float = 1,
) -> ProfileReport:
    """
    Builds a `ydata-profiling` report for a dataframe

    Polars dataframes are converted to pandas before profiling. When
    `sample_perc` is below 1 the data is sampled without replacement first;
    any value of 1 or more profiles the data as given.

    Args:
        data:
            Pandas or Polars dataframe to profile
        name:
            Name of the report. Used in the report title
        sample_perc:
            Fraction of rows to sample before profiling

    Returns:
        `ProfileReport`
    """
    from ydata_profiling import ProfileReport

    # Resolve the (optionally sampled) pandas frame first so the report
    # itself is constructed in exactly one place.
    frame = data
    if isinstance(frame, pl.DataFrame):
        if sample_perc < 1:
            frame = frame.sample(fraction=sample_perc, with_replacement=False, shuffle=True)
        frame = frame.to_pandas()
    elif sample_perc < 1:
        frame = frame.sample(frac=sample_perc, replace=False)

    config_path = os.path.join(DIR_PATH, "profile_config.yml")
    return ProfileReport(
        df=frame,
        config_file=config_path,
        lazy=False,
        title=f"Profile report for {name}",
    )

compare_reports(reports) staticmethod

Compares ProfileReports

Parameters:

Name Type Description Default
reports List[ProfileReport]

List of ProfileReport

required

Returns:

Type Description
ProfileReport

ProfileReport

Source code in opsml/profile/profile_data.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
@staticmethod
def compare_reports(reports: List[ProfileReport]) -> ProfileReport:
    """Builds a comparison report from several `ProfileReport` objects

    Args:
        reports:
            List of `ProfileReport` to compare

    Returns:
        `ProfileReport` produced by `ydata_profiling.compare`
    """
    from ydata_profiling import compare

    comparison = compare(reports=reports)
    return comparison