from typing import Literal, Optional
from pydantic import BaseModel, Field
# (stray "[docs]" source-link marker from the rendered documentation removed)
class RDF2VecConfig(BaseModel):
    """
    Configuration object for GPU-accelerated RDF2Vec.

    This Pydantic model centralizes all hyperparameters controlling:

    * walk generation
    * vocabulary construction
    * Word2Vec model architecture
    * training behavior (epochs, batch sizes, reproducibility)
    * execution backend (single GPU vs multi-GPU)
    * experiment tracking
    * artifact export settings
    * handling of literal (numeric) edges

    Values are validated when the model is constructed; a value outside the
    documented range or outside the listed choices is rejected by Pydantic.

    Parameters
    ----------
    walk_strategy : {"random", "bfs"}, default "random"
        Strategy used to generate walks from the knowledge graph.
    walk_depth : int, default 4
        Maximum depth of each walk. Must be greater than 0.
    walk_number : int, default 100
        Number of walks started per vertex. Must be greater than 0.
    walk_weighted : bool, default False
        If True, use edge weights for biased walk transitions via
        cuGraph's ``biased_random_walks()``. The input data must contain
        a ``"weights"`` column (cuGraph standard name).
    embedding_model : {"skipgram", "cbow"}, default "skipgram"
        Word2Vec variant used for embedding training.
    epochs : int, default 5
        Number of training epochs. Must be greater than 0.
    batch_size : int or None, default None
        Explicit batch size; if None, Lightning's tuner may pick one.
        When given, must be greater than 0.
    vector_size : int, default 256
        Dimensionality of the output embeddings. Must be greater than 0.
    window_size : int, default 5
        Context window size for Word2Vec. Must be greater than 1.
    min_count : int, default 1
        Minimum token frequency for inclusion in the vocabulary.
        Must be greater than or equal to 0.
    negative_samples : int, default 5
        Number of negative examples for negative sampling.
        Must be greater than or equal to 0.
    learning_rate : float, default 0.0001
        Learning rate used by the optimizer. Must be greater than 0.
    backend : {"pytorch", "gensim"}, default "pytorch"
        Backend selector declared alongside the embedding settings;
        presumably chooses the Word2Vec training implementation
        (confirm against the code that consumes this config).
    random_state : int, default 42
        Seed for reproducible walk sampling and model initialization.
        Must be greater than or equal to 0.
    reproducible : bool, default False
        If True, enables deterministic modes in PyTorch and CUDA.
    multi_gpu : bool, default False
        If True, enables multi-GPU walk generation and training using Dask.
    generate_artifact : bool, default False
        If True, persist `word2idx` and embeddings to Parquet files.
    cpu_count : int, default 4
        Number of CPU workers used. Must be greater than 0.
    tune_batch_size : bool, default True
        Whether to use PyTorch Lightning's automatic batch size tuning.
    num_nodes : int, default 1
        Number of nodes involved in multi-GPU setup. Must be greater than 0.
    tracker : {"mlflow", "wandb", "none"}, default "none"
        Experiment tracker selector; ``"none"`` presumably disables
        tracking (confirm against the consuming code).
    tracker_kwargs : dict or None, default None
        Optional extra settings associated with the tracker. The accepted
        keys are not defined by this model.
    tracker_run_name : str or None, default None
        Optional run name associated with the tracker.
    literal_predicates : list[str] or None, default None
        Predicates that identify literal (numeric) edges. When set, edges
        with these predicates are handled according to ``literal_strategy``.
        Predicate strings must match the values in the data exactly.
    literal_strategy : {"drop", "bin"}, default "drop"
        How to handle literal edges. ``"drop"`` removes them from the graph
        (pyRDF2Vec default). ``"bin"`` discretizes the object values into
        bin tokens so the edge stays in the graph.
    literal_n_bins : int, default 5
        Number of bins when ``literal_strategy="bin"``. Must be greater
        than 1.
    literal_bin_strategy : {"quantile", "uniform"}, default "quantile"
        Binning method. ``"quantile"`` creates equal-frequency bins (robust
        to skew). ``"uniform"`` creates equal-width bins.
    """

    # walk parameter settings
    walk_strategy: Literal["random", "bfs"] = "random"
    walk_depth: int = Field(default=4, gt=0)
    walk_number: int = Field(default=100, gt=0)
    walk_weighted: bool = False

    # embedding parameter settings
    embedding_model: Literal["skipgram", "cbow"] = "skipgram"
    epochs: int = Field(default=5, gt=0)
    batch_size: Optional[int] = Field(default=None, gt=0)
    vector_size: int = Field(default=256, gt=0)
    # gt=1 means a window of 1 is rejected; the smallest accepted value is 2.
    window_size: int = Field(default=5, gt=1)
    min_count: int = Field(default=1, ge=0)
    negative_samples: int = Field(default=5, ge=0)
    # NOTE(review): earlier documentation listed 0.025 as the default; the
    # value enforced here is 0.0001 -- confirm which one is intended.
    learning_rate: float = Field(default=0.0001, gt=0)
    backend: Literal["pytorch", "gensim"] = "pytorch"

    # library settings
    random_state: int = Field(default=42, ge=0)
    # NOTE(review): earlier documentation listed True as the default; the
    # value enforced here is False -- confirm which one is intended.
    reproducible: bool = False
    multi_gpu: bool = False
    generate_artifact: bool = False
    cpu_count: int = Field(default=4, gt=0)
    tune_batch_size: bool = True
    num_nodes: int = Field(default=1, gt=0)
    tracker: Literal["mlflow", "wandb", "none"] = "none"
    tracker_kwargs: Optional[dict] = None
    tracker_run_name: Optional[str] = None

    # literal handling settings
    literal_predicates: Optional[list[str]] = None
    literal_strategy: Literal["drop", "bin"] = "drop"
    # At least two bins are required for binning to be meaningful (gt=1).
    literal_n_bins: int = Field(default=5, gt=1)
    literal_bin_strategy: Literal["quantile", "uniform"] = "quantile"