Skip to content

Utils

evals_hub.utils.sampling

stratified_subsampling(dataset_dict, seed, splits=['test'], label='label', n_samples=2048)

Subsamples the dataset with stratification by the supplied label. Returns a DatasetDict object.

Parameters:

Name Type Description Default
dataset_dict DatasetDict

the DatasetDict object.

required
seed int

the random seed.

required
splits list[str]

the splits of the dataset.

['test']
label str

the label on which the stratified sampling is based.

'label'
n_samples int

Optional, number of samples to subsample. Defaults to 2048.

2048
Source code in evals_hub/utils/sampling.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def stratified_subsampling(
    dataset_dict: DatasetDict,
    seed: int,
    splits: list[str] | None = None,
    label: str = "label",
    n_samples: int = 2048,
) -> DatasetDict:
    """Subsamples the dataset with stratification by the supplied label.
    Returns a DatasetDict object.

    Args:
        dataset_dict: the DatasetDict object.
        seed: the random seed.
        splits: the splits of the dataset. Defaults to ["test"].
        label: the label on which the stratified sampling is based.
        n_samples: Optional, number of samples to subsample. Defaults to 2048.
    """
    # Resolve the default here instead of using a mutable default argument.
    if splits is None:
        splits = ["test"]

    ## Can only do this if the label column is of ClassLabel.
    if not isinstance(dataset_dict[splits[0]].features[label], ClassLabel):
        try:
            dataset_dict = dataset_dict.class_encode_column(label)
        except ValueError:
            # A sequence-valued label means a multilabel dataset, which
            # class_encode_column cannot encode; use the dedicated path.
            if isinstance(dataset_dict[splits[0]][label][0], Sequence):
                return _multilabel_subsampling(
                    dataset_dict, seed, splits, label, n_samples
                )
            # Bare raise preserves the original traceback.
            raise

    for split in splits:
        if n_samples >= len(dataset_dict[split]):
            logger.debug(
                f"Subsampling not needed for split {split}, as n_samples is equal or greater than the number of samples."
            )
            continue
        dataset_dict.update(
            {
                split: dataset_dict[split].train_test_split(
                    test_size=n_samples, seed=seed, stratify_by_column=label
                )["test"]
            }
        )  ## only take the specified test split.
    return dataset_dict

evals_hub.utils.utils

assign_session_id(data)

Assign session_id column to polars dataframe.

Source code in evals_hub/utils/utils.py
86
87
88
89
90
91
92
93
def assign_session_id(data: pl.DataFrame) -> pl.DataFrame:
    """
    Assign session_id column to polars dataframe.
    """
    # One freshly generated UUID4 hex string per row (unique on every call).
    ids = pl.Series("session_id", [uuid.uuid4().hex for _ in range(len(data))])
    return data.with_columns(ids)

backend_factory()

Configure the HTTP backend for requests to disable SSL verification.

Source code in evals_hub/utils/utils.py
28
29
30
31
32
def backend_factory() -> requests.Session:
    """Configure the HTTP backend for requests to disable SSL verification."""
    # NOTE(review): verify=False skips TLS certificate checks for every
    # request made through this session — confirm this is intentional.
    http_session = requests.Session()
    http_session.verify = False
    return http_session

create_repo_from_config(config_path, repo_name)

Create a Hugging Face dataset repository using configuration settings.

Parameters:

Name Type Description Default
config_path Path

Path to the Hugging Face configuration file containing: - org_name (required): Organization name for the repository - resource_group_id (optional): Resource group identifier - private_repository (optional, default True): Whether repo should be private

required
repo_name str

Name of the repository to create

required

Returns:

Name Type Description
str str

The full repository ID (org_name/repo_name)

Raises:

Type Description
HTTPError

If repository creation fails (except for 409 conflicts when repo already exists)

Source code in evals_hub/utils/utils.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def create_repo_from_config(config_path: Path, repo_name: str) -> str:
    """
    Create a Hugging Face dataset repository using configuration settings.

    Args:
        config_path: Path to the Hugging Face configuration file containing:
            - org_name (required): Organization name for the repository
            - resource_group_id (optional): Resource group identifier
            - private_repository (optional, default True): Whether repo should be private
        repo_name: Name of the repository to create

    Returns:
        str: The full repository ID (org_name/repo_name)

    Raises:
        HTTPError: If repository creation fails (except for 409 conflicts when repo already exists)
    """
    config = load_huggingface_config(config_path)
    # create a dataset repo in huggingface
    org_name = config["org_name"]
    resource_group_id = config.get("resource_group_id", None)
    private_repository = config.get("private_repository", True)

    repo_id = f"{org_name}/{repo_name}"
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=private_repository,
            resource_group_id=resource_group_id,
        )
    except HTTPError as err:
        # 409 = repo already exists; treated as success. Guard against
        # err.response being None before reading status_code so we don't
        # mask the original HTTPError with an AttributeError.
        if err.response is not None and err.response.status_code == 409:
            # Non-fatal condition: log as a warning rather than an error.
            logger.warning(f"Repo {repo_name} already exists.")
        else:
            # Bare raise preserves the original traceback.
            raise

    return repo_id

get_device()

Automatically detect and return the best available device.

Returns:

Type Description
device

torch.device: The best available device (cuda, mps, or cpu)

Source code in evals_hub/utils/utils.py
35
36
37
38
39
40
41
42
43
44
45
46
47
def get_device() -> torch.device:
    """
    Automatically detect and return the best available device.

    Returns:
        torch.device: The best available device (cuda, mps, or cpu)
    """
    # Preference order: CUDA GPU first, then Apple Metal (MPS), then CPU.
    accelerators = (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    )
    for name, is_available in accelerators:
        if is_available():
            return torch.device(name)
    return torch.device("cpu")

load_huggingface_config(config_path=None)

Load Hugging Face configuration from huggingface_config.yaml.

Parameters:

Name Type Description Default
config_path Path | None

Optional path to the config file. If None, uses the default path.

None

Returns:

Name Type Description
dict dict

Configuration containing org_name and resource_group_id

Source code in evals_hub/utils/utils.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def load_huggingface_config(config_path: Path | None = None) -> dict:
    """
    Load Hugging Face configuration from huggingface_config.yaml.

    Args:
        config_path: Optional path to the config file. If None, uses the default path.

    Returns:
        dict: Configuration containing org_name and resource_group_id

    Raises:
        FileNotFoundError: If the resolved configuration file does not exist.
    """
    # Fall back to the default location under the project's cronjob directory.
    resolved = (
        config_path
        if config_path is not None
        else Path(__file__).parents[3] / "cronjob" / "huggingface_config.yaml"
    )

    if not resolved.exists():
        raise FileNotFoundError(f"Configuration file not found: {resolved}")

    with open(resolved, "r") as f:
        return yaml.safe_load(f)

setup_logging()

Configure logging for the application.

Source code in evals_hub/utils/utils.py
19
20
21
22
23
24
25
def setup_logging() -> None:
    """Configure logging for the application."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"
    # basicConfig is a no-op if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO, format=log_format, datefmt=date_format)