Skip to content

Utils

evals_hub.utils.sampling

stratified_subsampling(dataset_dict, seed, splits=['test'], label='label', n_samples=2048)

Subsamples the dataset with stratification by the supplied label. Returns a DatasetDict object.

Parameters:

Name Type Description Default
dataset_dict DatasetDict

the DatasetDict object.

required
seed int

the random seed.

required
splits list[str]

the splits of the dataset.

['test']
label str

the label on which the stratified sampling is based.

'label'
n_samples int

Optional, number of samples to subsample. Defaults to 2048.

2048
Source code in evals_hub/utils/sampling.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def stratified_subsampling(
    dataset_dict: DatasetDict,
    seed: int,
    splits: list[str] | None = None,
    label: str = "label",
    n_samples: int = 2048,
) -> DatasetDict:
    """Subsamples the dataset with stratification by the supplied label.
    Returns a DatasetDict object.

    Args:
        dataset_dict: the DatasetDict object.
        seed: the random seed.
        splits: the splits of the dataset. Defaults to ["test"].
        label: the label on which the stratified sampling is based.
        n_samples: Optional, number of samples to subsample. Defaults to 2048.
    """
    # Resolve the default here instead of using a mutable default argument.
    if splits is None:
        splits = ["test"]

    ## Can only do this if the label column is of ClassLabel.
    if not isinstance(dataset_dict[splits[0]].features[label], ClassLabel):
        try:
            dataset_dict = dataset_dict.class_encode_column(label)
        except ValueError:
            # A sequence-valued label means a multilabel dataset, which
            # class_encode_column cannot encode; use the dedicated path.
            if isinstance(dataset_dict[splits[0]][label][0], Sequence):
                return _multilabel_subsampling(
                    dataset_dict, seed, splits, label, n_samples
                )
            # Bare raise preserves the original traceback.
            raise

    for split in splits:
        if n_samples >= len(dataset_dict[split]):
            logger.debug(
                f"Subsampling not needed for split {split}, as n_samples is equal or greater than the number of samples."
            )
            continue
        dataset_dict.update(
            {
                split: dataset_dict[split].train_test_split(
                    test_size=n_samples, seed=seed, stratify_by_column=label
                )["test"]
            }
        )  ## only take the specified test split.
    return dataset_dict

evals_hub.utils.utils

assign_session_id(data)

Assign session_id column to polars dataframe.

Source code in evals_hub/utils/utils.py
86
87
88
89
90
91
92
93
def assign_session_id(data: pl.DataFrame) -> pl.DataFrame:
    """
    Assign session_id column to polars dataframe.
    """
    # One freshly generated UUID4 hex string per row (unique on every call).
    ids = pl.Series("session_id", [uuid.uuid4().hex for _ in range(len(data))])
    return data.with_columns(ids)

backend_factory()

Configure the HTTP backend for requests to disable SSL verification.

Source code in evals_hub/utils/utils.py
28
29
30
31
32
def backend_factory() -> requests.Session:
    """Configure the HTTP backend for requests to disable SSL verification."""
    # NOTE(review): verify=False skips TLS certificate checks for every
    # request made through this session — confirm this is intentional.
    http_session = requests.Session()
    http_session.verify = False
    return http_session

create_repo_from_config(config_path, repo_name)

Create a Hugging Face dataset repository using configuration settings.

Parameters:

Name Type Description Default
config_path Path

Path to the Hugging Face configuration file containing: - org_name (required): Organization name for the repository - resource_group_id (optional): Resource group identifier - private_repository (optional, default True): Whether repo should be private

required
repo_name str

Name of the repository to create

required

Returns:

Name Type Description
str str

The full repository ID (org_name/repo_name)

Raises:

Type Description
HTTPError

If repository creation fails (except for 409 conflicts when repo already exists)

Source code in evals_hub/utils/utils.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def create_repo_from_config(config_path: Path, repo_name: str) -> str:
    """
    Create a Hugging Face dataset repository using configuration settings.

    Args:
        config_path: Path to the Hugging Face configuration file containing:
            - org_name (required): Organization name for the repository
            - resource_group_id (optional): Resource group identifier
            - private_repository (optional, default True): Whether repo should be private
        repo_name: Name of the repository to create

    Returns:
        str: The full repository ID (org_name/repo_name)

    Raises:
        HTTPError: If repository creation fails (except for 409 conflicts when repo already exists)
    """
    config = load_huggingface_config(config_path)
    # create a dataset repo in huggingface
    org_name = config["org_name"]
    resource_group_id = config.get("resource_group_id", None)
    private_repository = config.get("private_repository", True)

    repo_id = f"{org_name}/{repo_name}"
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=private_repository,
            resource_group_id=resource_group_id,
        )
    except HTTPError as err:
        # 409 = repo already exists; treated as success. Guard against
        # err.response being None before reading status_code so we don't
        # mask the original HTTPError with an AttributeError.
        if err.response is not None and err.response.status_code == 409:
            # Non-fatal condition: log as a warning rather than an error.
            logger.warning(f"Repo {repo_name} already exists.")
        else:
            # Bare raise preserves the original traceback.
            raise

    return repo_id

get_device()

Automatically detect and return the best available device.

Returns:

Type Description
device

torch.device: The best available device (cuda, mps, or cpu)

Source code in evals_hub/utils/utils.py
35
36
37
38
39
40
41
42
43
44
45
46
47
def get_device() -> torch.device:
    """
    Automatically detect and return the best available device.

    Returns:
        torch.device: The best available device (cuda, mps, or cpu)
    """
    # Preference order: CUDA GPU first, then Apple Metal (MPS), then CPU.
    accelerators = (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    )
    for name, is_available in accelerators:
        if is_available():
            return torch.device(name)
    return torch.device("cpu")

load_huggingface_config(config_path=None)

Load Hugging Face configuration from huggingface_config.yaml.

Parameters:

Name Type Description Default
config_path Path | None

Optional path to the config file. If None, uses the default path.

None

Returns:

Name Type Description
dict dict

Configuration containing org_name and resource_group_id

Source code in evals_hub/utils/utils.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def load_huggingface_config(config_path: Path | None = None) -> dict:
    """
    Load Hugging Face configuration from huggingface_config.yaml.

    Args:
        config_path: Optional path to the config file. If None, uses the default path.

    Returns:
        dict: Configuration containing org_name and resource_group_id

    Raises:
        FileNotFoundError: If the resolved configuration file does not exist.
    """
    # Fall back to the default location under the project's cronjob directory.
    resolved = (
        config_path
        if config_path is not None
        else Path(__file__).parents[3] / "cronjob" / "huggingface_config.yaml"
    )

    if not resolved.exists():
        raise FileNotFoundError(f"Configuration file not found: {resolved}")

    with open(resolved, "r") as f:
        return yaml.safe_load(f)

setup_logging()

Configure logging for the application.

Source code in evals_hub/utils/utils.py
19
20
21
22
23
24
25
def setup_logging() -> None:
    """Configure logging for the application."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"
    # basicConfig is a no-op if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO, format=log_format, datefmt=date_format)