regdatadescription

regdatadescription ¶

Module for RegDataDescription

Classes¶

FeatureType ¶

FeatureTypes are used for defining which kind of features are available.

Functions¶

get_available_features `classmethod` ¶

get_available_features()

Returns list of available feature types

Source code in niceml/data/datadescriptions/regdatadescription.py

@classmethod
def get_available_features(cls) -> List[str]:
    """List every feature type defined on this class.

    Returns:
        The scalar, categorical and binary feature type values, in that order.
    """
    available_types: List[str] = [cls.SCALAR, cls.CATEGORICAL, cls.BINARY]
    return available_types

RegDataDescription `dataclass` ¶

Bases: InputVectorDataDescription, OutputVectorDataDescription

DataDescription for Regression data. Uses vectors as input and output

Functions¶

get_dict ¶

get_dict()

Returns dictionary of inputs and targets

Source code in niceml/data/datadescriptions/regdatadescription.py

def get_dict(self) -> dict:
    """Bundle the input and target definitions into one dictionary.

    Returns:
        A dictionary with the keys ``"inputs"`` and ``"targets"`` holding
        ``self.inputs`` and ``self.targets`` respectively (not copied).
    """
    return {"inputs": self.inputs, "targets": self.targets}

get_input_entry_names ¶

get_input_entry_names()

Returns names of input entries

Source code in niceml/data/datadescriptions/regdatadescription.py

def get_input_entry_names(self) -> List[str]:
    """Collect the entry names that make up the input vector.

    A scalar or binary input contributes its key unchanged. A categorical
    input contributes one name per category: its key followed by the category
    index, zero-padded to at least three digits. Inputs of any other type
    contribute nothing.

    Returns:
        The entry names, in the order the inputs are declared.
    """
    names: List[str] = []
    for entry in self.inputs:
        entry_type = entry["type"]
        if entry_type in (FeatureType.SCALAR, FeatureType.BINARY):
            names.append(entry["key"])
            continue
        if entry_type == FeatureType.CATEGORICAL:
            # One name per category value, e.g. "<key>000", "<key>001", ...
            names.extend(
                f"{entry['key']}{index:03d}"
                for index in range(entry["value_count"])
            )
    return names

get_input_size ¶

get_input_size()

Returns size of input vector(s)

Source code in niceml/data/datadescriptions/regdatadescription.py

def get_input_size(self) -> int:
    """Compute the length of the input vector.

    Returns:
        The result of ``get_feature_size`` applied to ``self.inputs``.
    """
    input_size = get_feature_size(self.inputs)
    return input_size

get_min_max_vals ¶

get_min_max_vals()

Get min and max values for categorical and binary input values

Source code in niceml/data/datadescriptions/regdatadescription.py

def get_min_max_vals(self) -> Dict[str, Tuple[int, int]]:
    """Map each binary and categorical input key to its (min, max) value.

    Binary inputs get ``(0, 1)``; categorical inputs get
    ``(0, value_count - 1)``. Inputs of any other type are left out.

    Returns:
        A dictionary from input key to its ``(min, max)`` tuple.
    """
    bounds: Dict[str, Tuple[int, int]] = {}
    for entry in self.inputs:
        entry_type = entry["type"]
        if entry_type == FeatureType.BINARY:
            upper = 1
        elif entry_type == FeatureType.CATEGORICAL:
            upper = entry["value_count"] - 1
        else:
            # Other feature types have no fixed value range here.
            continue
        bounds[entry["key"]] = (0, upper)
    return bounds

get_output_entry_names ¶

get_output_entry_names()

Returns names of targets

Source code in niceml/data/datadescriptions/regdatadescription.py

def get_output_entry_names(self) -> List[str]:
    """Collect the keys of all targets.

    Returns:
        The target keys, in the order the targets are declared.

    Raises:
        ValueError: If a target has a type other than ``"scalar"``.
    """
    names = []
    for entry in self.targets:
        if entry["type"] == "scalar":
            names.append(entry["key"])
        else:
            raise ValueError("Target feature type is not scalar")
    return names

get_output_size ¶

get_output_size()

Returns size of output vector(s) (targets)

Source code in niceml/data/datadescriptions/regdatadescription.py

def get_output_size(self) -> int:
    """Compute the length of the output (target) vector.

    Returns:
        The result of ``get_feature_size`` applied to ``self.targets``.
    """
    output_size = get_feature_size(self.targets)
    return output_size

Functions¶

get_feature_size ¶

get_feature_size(features)

Returns the total size of all features in the 'features' list

Source code in niceml/data/datadescriptions/regdatadescription.py

def get_feature_size(features: List[dict]) -> int:
    """Sum up the number of vector entries needed for the given features.

    Binary and scalar features count as one entry each; every other available
    feature type counts as ``feature["value_count"]`` entries.

    Args:
        features: Feature dictionaries; each needs a ``"type"`` entry, and
            a ``"value_count"`` entry when it is not binary or scalar.

    Returns:
        The total number of entries over all features.

    Raises:
        ModeNotImplementedError: If a feature's type is not one of
            ``FeatureType.get_available_features()``.
    """
    total = 0
    for feature in features:
        feature_type = feature["type"]
        if feature_type not in FeatureType.get_available_features():
            raise ModeNotImplementedError(
                f"Feature type {feature['type']} not implemented"
            )
        if feature_type in [FeatureType.BINARY, FeatureType.SCALAR]:
            total += 1
        else:
            total += feature["value_count"]
    return total

inputs_prefix_factory ¶

inputs_prefix_factory(
    data_location,
    prefix,
    feature_type,
    data_file_name="train.parq",
)

The inputs_prefix_factory function is a factory function that returns a list of input features as dictionaries.

Parameters:

data_location (Union[dict, LocationConfig]) –

Specify the location of the data
prefix (str) –

Filter the columns in the dataframe
feature_type (str) –

Specify the type of feature
data_file_name (str, default: 'train.parq' ) –

Specify the name of the file to be read from data_location

Returns: A list of input features as dictionaries

Source code in niceml/data/datadescriptions/regdatadescription.py

def inputs_prefix_factory(
    data_location: Union[dict, LocationConfig],
    prefix: str,
    feature_type: str,
    data_file_name: str = "train.parq",
) -> List[dict]:
    """
    Build input feature dictionaries from the columns of a parquet file.

    The file ``data_file_name`` is read from ``data_location``. Every column
    whose name starts with ``prefix`` becomes one feature dictionary with the
    column name as ``"key"`` and ``feature_type`` as ``"type"``.

    Args:
        data_location: Location that contains the data file
        prefix: Only columns whose name starts with this prefix are used
        feature_type: Feature type assigned to every selected column
        data_file_name: Name of the file to be read from data_location
    Returns:
        A list of input features as dictionaries. The list is empty if the
        data file does not exist; a warning is logged in that case.
    """
    with open_location(data_location) as (file_system, root_path):
        try:
            table = read_parquet(
                filepath=join_fs_path(file_system, root_path, data_file_name),
                file_system=file_system,
            )
            selected: List[dict] = []
            for column in table.columns:
                if column.startswith(prefix):
                    selected.append({"key": column, "type": feature_type})
            return selected
        except FileNotFoundError:
            # A missing data file is tolerated: warn and fall back to no inputs.
            logging.getLogger(__name__).warning(
                "Data file not found. Inputs will be empty."
            )
            return []

load_data_infos ¶

load_data_infos(yaml_path)

Loads and returns RegDataDescription from yaml-path

Source code in niceml/data/datadescriptions/regdatadescription.py

def load_data_infos(yaml_path: str) -> RegDataDescription:  # QUEST: still used?
    """Loads and returns RegDataDescription from yaml-path

    The top-level mapping of the YAML file is passed as keyword arguments to
    ``RegDataDescription``.

    Args:
        yaml_path: Path of the YAML file to read

    Returns:
        The RegDataDescription built from the file content
    """
    # Decode explicitly as UTF-8: without an encoding, open() falls back to the
    # platform locale, so the same file could parse differently per machine.
    with open(yaml_path, "r", encoding="utf-8") as file:
        data = yaml.load(file, Loader=yaml.SafeLoader)
    return RegDataDescription(**data)

reg_data_description_factory ¶

reg_data_description_factory(
    train_data_location,
    train_set_file_name,
    filter_function,
    **kwargs
)

The reg_data_description_factory function is a factory function that returns a RegDataDescription object. The RegDataDescription object contains the inputs and targets of the regression data set. The function takes the following arguments: train_data_location (the location of the training data set), train_set_file_name (the name of the training data set file), and filter_function (a function that is called with the loaded training data and extracts the input and target features from it).

Parameters:

train_data_location (Union[dict, LocationConfig]) –

The location of the training data set
train_set_file_name (str) –

The name of the training data set file
filter_function (FunctionType) –

A function that is called with the loaded training data (as `data`) and returns the input and target features extracted from it
**kwargs –

Additional keyword arguments that are passed on to the filter_function

Returns:

RegDataDescription –

A RegDataDescription with inputs and targets created by the filter_function

Source code in niceml/data/datadescriptions/regdatadescription.py

def reg_data_description_factory(
    train_data_location: Union[dict, LocationConfig],
    train_set_file_name: str,
    filter_function: FunctionType,
    **kwargs,
) -> RegDataDescription:
    """
    Create a RegDataDescription from a training data file.

    The training file is read as parquet from ``train_data_location`` and
    handed to ``filter_function`` as the keyword argument ``data``, together
    with ``**kwargs``. The function must return a pair ``(inputs, targets)``,
    which is used to build the RegDataDescription.

    Args:
        train_data_location: The location of the training data set
        train_set_file_name: The name of the training data set file
        filter_function: Function called with the loaded training data that
                        returns the input and target feature lists
        **kwargs: Additional keyword arguments passed on to the filter_function

    Returns:
        A RegDataDescription with inputs and targets created by the filter_function
    """
    with open_location(train_data_location) as (file_system, root_path):
        train_file_path = join_fs_path(file_system, root_path, train_set_file_name)
        train_frame = read_parquet(
            filepath=train_file_path,
            file_system=file_system,
        )

        # The filter function decides which features are inputs and which are targets.
        inputs, targets = filter_function(data=train_frame, **kwargs)

        return RegDataDescription(inputs=inputs, targets=targets)

regdatadescription