regdatadescription

Module for RegDataDescription

Classes

FeatureType

FeatureType defines which kinds of features are available.

Functions
get_available_features classmethod
get_available_features()

Returns list of available feature types

Source code in niceml/data/datadescriptions/regdatadescription.py
@classmethod
def get_available_features(cls) -> List[str]:
    """Returns list of available feature types"""
    return [cls.SCALAR, cls.CATEGORICAL, cls.BINARY]
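
A minimal usage sketch (illustrative, not from the source). The printed values assume the class constants are the lowercase strings "scalar", "categorical", and "binary", as suggested by the literal "scalar" comparison in get_output_entry_names below.

from niceml.data.datadescriptions.regdatadescription import FeatureType

# List the feature types the module knows about
print(FeatureType.get_available_features())
# e.g. ['scalar', 'categorical', 'binary']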

RegDataDescription dataclass

Bases: InputVectorDataDescription, OutputVectorDataDescription

DataDescription for regression data. Uses vectors as input and output.

Functions
get_dict
get_dict()

Returns dictionary of inputs and targets

Source code in niceml/data/datadescriptions/regdatadescription.py
def get_dict(self) -> dict:
    """Returns dictionary of inputs and targets"""
    return dict(inputs=self.inputs, targets=self.targets)
get_input_entry_names
get_input_entry_names()

Returns names of input entries

Source code in niceml/data/datadescriptions/regdatadescription.py
def get_input_entry_names(self) -> List[str]:
    """Returns names of input entries"""
    input_keys: List[str] = []
    for cur_input in self.inputs:
        if cur_input["type"] in [FeatureType.SCALAR, FeatureType.BINARY]:
            input_keys.append(cur_input["key"])
        elif cur_input["type"] == FeatureType.CATEGORICAL:
            input_keys += [
                f"{cur_input['key']}{x:03d}"
                for x in range(cur_input["value_count"])
            ]

    return input_keys
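
As an illustrative sketch (the feature keys below are hypothetical), a categorical input with value_count entries is expanded into one name per category, suffixed with a zero-padded index:

from niceml.data.datadescriptions.regdatadescription import (
    FeatureType,
    RegDataDescription,
)

# Hypothetical description with one scalar and one categorical input
data_description = RegDataDescription(
    inputs=[
        {"key": "temperature", "type": FeatureType.SCALAR},
        {"key": "weekday", "type": FeatureType.CATEGORICAL, "value_count": 7},
    ],
    targets=[{"key": "sales", "type": FeatureType.SCALAR}],
)

print(data_description.get_input_entry_names())
# ['temperature', 'weekday000', 'weekday001', ..., 'weekday006']
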
get_input_size
get_input_size()

Returns size of input vector(s)

Source code in niceml/data/datadescriptions/regdatadescription.py
def get_input_size(self) -> int:
    """Returns size of input vector(s)"""
    return get_feature_size(self.inputs)
get_min_max_vals
get_min_max_vals()

Get min and max values for categorical and binary input values

Source code in niceml/data/datadescriptions/regdatadescription.py
def get_min_max_vals(self) -> Dict[str, Tuple[int, int]]:
    """Get min and max values for categorical and binary input values"""
    min_max_dict: Dict[str, Tuple[int, int]] = {}
    for input_vector in self.inputs:
        if input_vector["type"] == FeatureType.BINARY:
            min_max_dict[input_vector["key"]] = (0, 1)
        elif input_vector["type"] == FeatureType.CATEGORICAL:
            min_max_dict[input_vector["key"]] = (0, input_vector["value_count"] - 1)

    return min_max_dict
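
Continuing the illustrative description from above, only binary and categorical inputs appear in the result; scalar inputs are omitted:

print(data_description.get_min_max_vals())
# {'weekday': (0, 6)}   # a binary input would map to (0, 1)
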
get_output_entry_names
get_output_entry_names()

Returns names of targets

Source code in niceml/data/datadescriptions/regdatadescription.py
def get_output_entry_names(self) -> List[str]:
    """Returns names of targets"""
    target_keys = []
    for target in self.targets:
        if target["type"] != "scalar":
            raise ValueError("Target feature type is not scalar")
        target_keys.append(target["key"])
    return target_keys
get_output_size
get_output_size()

Returns size of output vector(s) (targets)

Source code in niceml/data/datadescriptions/regdatadescription.py
def get_output_size(self) -> int:
    """Returns size of output vector(s) (targets)"""
    return get_feature_size(self.targets)
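
Continuing the illustrative description from above (and assuming FeatureType.SCALAR equals the string "scalar", so the target type check passes):

print(data_description.get_output_entry_names())  # ['sales']
print(data_description.get_output_size())         # 1
print(data_description.get_input_size())          # 1 scalar + 7 categorical values = 8
print(data_description.get_dict())                # {'inputs': [...], 'targets': [...]}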

Functions

get_feature_size

get_feature_size(features)

Returns the combined size of the features in the 'features' list

Source code in niceml/data/datadescriptions/regdatadescription.py
def get_feature_size(features: List[dict]) -> int:
    """Returns size of features in 'features' dictionary"""
    count = 0
    for feature in features:
        feature_type = feature["type"]
        if feature_type not in FeatureType.get_available_features():
            raise ModeNotImplementedError(
                f"Feature type {feature['type']} not implemented"
            )
        count += (
            1
            if feature_type in [FeatureType.BINARY, FeatureType.SCALAR]
            else feature["value_count"]
        )
    return count
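
A short sketch of how the sizes add up (the feature list is hypothetical): scalar and binary features contribute one entry each, while a categorical feature contributes value_count entries.

from niceml.data.datadescriptions.regdatadescription import (
    FeatureType,
    get_feature_size,
)

features = [
    {"key": "temperature", "type": FeatureType.SCALAR},                     # 1
    {"key": "is_holiday", "type": FeatureType.BINARY},                      # 1
    {"key": "weekday", "type": FeatureType.CATEGORICAL, "value_count": 7},  # 7
]
print(get_feature_size(features))  # 9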

inputs_prefix_factory

inputs_prefix_factory(
    data_location,
    prefix,
    feature_type,
    data_file_name="train.parq",
)

The inputs_prefix_factory function is a factory that returns a list of input features as dictionaries.

Parameters:

  • data_location (Union[dict, LocationConfig]) –

    Specify the location of the data

  • prefix (str) –

    Filter the columns in the dataframe

  • feature_type (str) –

    Specify the type of feature

  • data_file_name (str, default: 'train.parq') –

    Specify the name of the file to be read from data_location

Returns: A list of input features as dictionaries

Source code in niceml/data/datadescriptions/regdatadescription.py
def inputs_prefix_factory(
    data_location: Union[dict, LocationConfig],
    prefix: str,
    feature_type: str,
    data_file_name: str = "train.parq",
) -> List[dict]:
    """
    The inputs_prefix_factory function is a factory function that returns a list of
    input features as dictionaries.

    Args:
        data_location: Specify the location of the data
        prefix: Filter the columns in the dataframe
        feature_type: Specify the type of feature
        data_file_name: Specify the name of the file to be read from data_location
    Returns:
        A list of input features as dictionaries
    """
    with open_location(data_location) as (
        data_fs,
        data_root,
    ):
        try:
            loaded_data = read_parquet(
                filepath=join_fs_path(data_fs, data_root, data_file_name),
                file_system=data_fs,
            )
            return [
                {"key": column, "type": feature_type}
                for column in loaded_data.columns
                if column.startswith(prefix)
            ]
        except FileNotFoundError:
            logger = logging.getLogger(__name__)
            logger.warning("Data file not found. Inputs will be empty.")
        return []
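
An illustrative call (the location dict shape and the column prefix are assumptions for this sketch); every column in the data file that starts with the prefix becomes one input dictionary of the given feature type:

from niceml.data.datadescriptions.regdatadescription import (
    FeatureType,
    inputs_prefix_factory,
)

inputs = inputs_prefix_factory(
    data_location={"uri": "path/to/dataset"},  # assumed location layout
    prefix="feat_",
    feature_type=FeatureType.SCALAR,
)
# Returns one dict per matching column in train.parq, e.g.
# {"key": "feat_temperature", "type": FeatureType.SCALAR}; an empty list
# if the file cannot be found.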

load_data_infos

load_data_infos(yaml_path)

Loads and returns RegDataDescription from yaml-path

Source code in niceml/data/datadescriptions/regdatadescription.py
def load_data_infos(yaml_path: str) -> RegDataDescription:  # QUEST: still used?
    """Loads and returns RegDataDescription from yaml-path"""
    with open(yaml_path, "r") as file:
        data = yaml.load(file, Loader=yaml.SafeLoader)
    return RegDataDescription(**data)
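
A sketch of a matching YAML file (keys and values are hypothetical); the loaded dictionary has to provide the inputs and targets fields of the dataclass:

# Hypothetical contents of data_info.yaml:
#
#   inputs:
#     - key: temperature
#       type: scalar
#     - key: weekday
#       type: categorical
#       value_count: 7
#   targets:
#     - key: sales
#       type: scalar

from niceml.data.datadescriptions.regdatadescription import load_data_infos

data_description = load_data_infos("data_info.yaml")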

reg_data_description_factory

reg_data_description_factory(
    train_data_location,
    train_set_file_name,
    filter_function,
    **kwargs
)

The reg_data_description_factory function is a factory that returns a RegDataDescription object. The RegDataDescription object contains the inputs and targets of the regression data set, created by applying the given filter_function to the training data.

Parameters:

  • train_data_location (Union[dict, LocationConfig]) –

    The location of the training data set

  • train_set_file_name (str) –

    The name of the training data set file

  • filter_function (FunctionType) –

    A filtering function to apply to each row in order to extract input and target features from it

  • **kwargs

Additional arguments passed on to the filter_function

Returns:

  • RegDataDescription

A RegDataDescription with inputs and targets created by the filter_function

Source code in niceml/data/datadescriptions/regdatadescription.py
def reg_data_description_factory(
    train_data_location: Union[dict, LocationConfig],
    train_set_file_name: str,
    filter_function: FunctionType,
    **kwargs,
) -> RegDataDescription:
    """
    The reg_data_description_factory function is a factory function that returns a
    RegDataDescription object. The RegDataDescription object contains the inputs and targets
    of the regression data set.
    The reg_data_description_factory function takes in arguments for:
        - train_data_location: The location of the training data set
        - train_set_file_name: The name of the training data set file
        - filter_function: A filtering function to apply to each row in order to
                            extract input and target features from it

    Args:
        train_data_location: The location of the training data set
        train_set_file_name: The name of the training data set file
        filter_function: A filtering function to apply to each row in order to
                        extract input and target features from it
        **kwargs: Additional arguments passed on to the filter_function

    Returns:
        A RegDataDescription with inputs and targets created by the filter_function
    """
    with open_location(train_data_location) as (
        regression_data_fs,
        regression_data_root,
    ):
        train_data = read_parquet(
            filepath=join_fs_path(
                regression_data_fs, regression_data_root, train_set_file_name
            ),
            file_system=regression_data_fs,
        )

        inputs: List[Dict[str, str]]
        targets: List[Dict[str, str]]

        inputs, targets = filter_function(data=train_data, **kwargs)

        return RegDataDescription(inputs=inputs, targets=targets)
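
A usage sketch with a hypothetical filter function (the location dict shape, column prefixes, and file name are assumptions); the filter function receives the loaded training DataFrame and returns the input and target feature lists:

from niceml.data.datadescriptions.regdatadescription import (
    FeatureType,
    reg_data_description_factory,
)

def prefix_filter(data, input_prefix="feat_", target_prefix="target_"):
    """Hypothetical filter: splits columns into scalar inputs and targets by prefix"""
    inputs = [
        {"key": col, "type": FeatureType.SCALAR}
        for col in data.columns
        if col.startswith(input_prefix)
    ]
    targets = [
        {"key": col, "type": FeatureType.SCALAR}
        for col in data.columns
        if col.startswith(target_prefix)
    ]
    return inputs, targets

data_description = reg_data_description_factory(
    train_data_location={"uri": "path/to/dataset"},  # assumed location layout
    train_set_file_name="train.parq",
    filter_function=prefix_filter,
    input_prefix="feat_",  # forwarded to prefix_filter via **kwargs
)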