Source code for rdf2vecgpu.config

from typing import Literal, Optional
from pydantic import BaseModel, Field



[docs]
class RDF2VecConfig(BaseModel):
    """
    Configuration object for GPU-accelerated RDF2Vec.

    This pydantic model centralizes all hyperparameters controlling:
       • walk generation
       • vocabulary construction
       • Word2Vec model architecture
       • training behavior (epochs, batch sizes, reproducibility)
       • execution backend (single GPU vs multi-GPU)
       • artifact export settings
       • experiment tracking
       • literal (numeric) edge handling

    Values are validated on construction; a value outside the bounds
    listed below is rejected by pydantic.

    Parameters
    ----------
    walk_strategy : {"random", "bfs"}, default "random"
        Strategy used to generate walks from the knowledge graph.

    walk_depth : int, default 4
        Maximum depth of each walk. Must be > 0.

    walk_number : int, default 100
        Number of walks started per vertex. Must be > 0.

    walk_weighted : bool, default False
        If True, use edge weights for biased walk transitions via
        cuGraph's ``biased_random_walks()``. The input data must contain
        a ``"weights"`` column (cuGraph standard name).

    embedding_model : {"skipgram", "cbow"}, default "skipgram"
        Word2Vec variant used for embedding training.

    vector_size : int, default 256
        Dimensionality of the output embeddings. Must be > 0.

    window_size : int, default 5
        Context window size for Word2Vec. Must be > 1.

    min_count : int, default 1
        Minimum token frequency for inclusion in the vocabulary.
        Must be >= 0.

    negative_samples : int, default 5
        Number of negative examples for negative sampling. Must be >= 0.

    learning_rate : float, default 0.0001
        Learning rate used by the optimizer. Must be > 0.
        NOTE(review): earlier documentation stated 0.025; the value
        declared on the field is 0.0001 — confirm which is intended.

    backend : {"pytorch", "gensim"}, default "pytorch"
        Presumably selects the Word2Vec training implementation; its use
        is not visible in this module — TODO confirm.

    epochs : int, default 5
        Number of training epochs. Must be > 0.

    batch_size : int or None, default None
        Explicit batch size; if None, Lightning's tuner may pick one.
        When given, must be > 0.

    tune_batch_size : bool, default True
        Whether to use PyTorch Lightning’s automatic batch size tuning.

    random_state : int, default 42
        Seed for reproducible walk sampling and model initialization.
        Must be >= 0.

    reproducible : bool, default False
        If True, enables deterministic modes in PyTorch and CUDA.
        NOTE(review): earlier documentation stated default True; the
        value declared on the field is False — confirm which is intended.

    multi_gpu : bool, default False
        If True, enables multi-GPU walk generation and training using Dask.

    cpu_count : int, default 4
        Number of CPU workers used. Must be > 0.

    generate_artifact : bool, default False
        If True, persist `word2idx` and embeddings to Parquet files.

    num_nodes : int, default 1
        Number of nodes involved in multi-GPU setup. Must be > 0.

    tracker : {"mlflow", "wandb", "none"}, default "none"
        Presumably the experiment-tracking integration to use, with
        ``"none"`` disabling tracking — TODO confirm against the caller.

    tracker_kwargs : dict or None, default None
        Presumably extra keyword arguments forwarded to the selected
        tracker — TODO confirm.

    tracker_run_name : str or None, default None
        Presumably the run name reported to the selected tracker —
        TODO confirm.

    literal_predicates : list[str] or None, default None
        Predicates that identify literal (numeric) edges. When set, edges
        with these predicates are handled according to ``literal_strategy``.
        Predicate strings must match the values in the data exactly.

    literal_strategy : {"drop", "bin"}, default "drop"
        How to handle literal edges. ``"drop"`` removes them from the graph
        (pyRDF2Vec default). ``"bin"`` discretizes the object values into
        bin tokens so the edge stays in the graph.

    literal_n_bins : int, default 5
        Number of bins when ``literal_strategy="bin"``. Must be > 1.

    literal_bin_strategy : {"quantile", "uniform"}, default "quantile"
        Binning method. ``"quantile"`` creates equal-frequency bins (robust
        to skew). ``"uniform"`` creates equal-width bins.
    """

    # walk parameter settings
    walk_strategy: Literal["random", "bfs"] = "random"
    walk_depth: int = Field(default=4, gt=0)
    walk_number: int = Field(default=100, gt=0)
    walk_weighted: bool = False
    # embedding parameter settings
    embedding_model: Literal["skipgram", "cbow"] = "skipgram"
    epochs: int = Field(default=5, gt=0)
    # None is allowed; the gt=0 bound applies only to explicit values
    batch_size: Optional[int] = Field(default=None, gt=0)
    vector_size: int = Field(default=256, gt=0)
    # strictly greater than 1, so a window of 1 is rejected
    window_size: int = Field(default=5, gt=1)
    min_count: int = Field(default=1, ge=0)
    negative_samples: int = Field(default=5, ge=0)
    learning_rate: float = Field(default=0.0001, gt=0)
    backend: Literal["pytorch", "gensim"] = "pytorch"
    # library settings
    random_state: int = Field(default=42, ge=0)
    reproducible: bool = False
    multi_gpu: bool = False
    generate_artifact: bool = False
    cpu_count: int = Field(default=4, gt=0)
    tune_batch_size: bool = True
    num_nodes: int = Field(default=1, gt=0)
    # experiment tracking settings
    tracker: Literal["mlflow", "wandb", "none"] = "none"
    tracker_kwargs: Optional[dict] = None
    tracker_run_name: Optional[str] = None
    # literal handling settings
    literal_predicates: Optional[list[str]] = None
    literal_strategy: Literal["drop", "bin"] = "drop"
    # strictly greater than 1: at least two bins are required
    literal_n_bins: int = Field(default=5, gt=1)
    literal_bin_strategy: Literal["quantile", "uniform"] = "quantile"