Skip to content

Commit 0e19ee4

Browse files
authored
fix: make CE entity generation deterministic when seeded (#117)
- rework RNG plumbing across person, insurance, finance, and public-sector generators so seeded runs replay deterministically while unseeded runs stay random
- expand CE datasets (countries, ecommerce currencies, product vocab, shipping/payment tables) for ES/FR/GB coverage and align integration descriptors with the new data
- rebuild the CE integration suite around local descriptors (no EE references) and add reproducibility assertions with rngSeed fixtures
- add regression tests ensuring unseeded generators still emit varied data and “other” gender nobility titles remain strings
- clean up Faker RNG wiring, bank currency sampling, and adapter services to satisfy mypy/ruff and keep tasks deterministic
- document WHY blocks in descriptors/scripts for future maintainers and seed ledger helper for deterministic finance scripts
- verified with `uv run pytest -q tests_ce/integration_tests/test_entity`, `uv run pytest tests_ce/unit_tests/test_generator/test_person_generator_randomness.py`, and `mypy datamimic_ce`
1 parent 59896f2 commit 0e19ee4

File tree

182 files changed

+3781
-429
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

182 files changed

+3781
-429
lines changed

datamimic_ce/domains/common/generators/person_generator.py

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -57,19 +57,55 @@ def __init__(
5757
and resolved_config.age_min > resolved_config.age_max
5858
):
5959
raise ValueError("age_min cannot be greater than age_max")
60-
self._gender_generator = GenderGenerator(female_quota=female_quota, other_gender_quota=other_gender_quota)
61-
self._given_name_generator = GivenNameGenerator(dataset=self._dataset)
62-
self._family_name_generator = FamilyNameGenerator(dataset=self._dataset)
63-
self._email_generator = EmailAddressGenerator(dataset=self._dataset)
64-
self._phone_generator = PhoneNumberGenerator(dataset=self._dataset)
65-
self._address_generator = AddressGenerator(dataset=self._dataset)
60+
# Fan out deterministic RNG copies so rngSeed descriptors yield reproducible composite attributes.
61+
self._gender_generator = GenderGenerator(
62+
female_quota=female_quota,
63+
other_gender_quota=other_gender_quota,
64+
rng=self._derive_rng() if rng is not None else None,
65+
)
66+
self._given_name_generator = GivenNameGenerator(
67+
dataset=self._dataset,
68+
rng=self._derive_rng() if rng is not None else None,
69+
)
70+
self._family_name_generator = FamilyNameGenerator(
71+
dataset=self._dataset,
72+
rng=self._derive_rng() if rng is not None else None,
73+
)
74+
self._email_generator = EmailAddressGenerator(
75+
dataset=self._dataset,
76+
rng=self._derive_rng() if rng is not None else None,
77+
)
78+
self._phone_generator = PhoneNumberGenerator(
79+
dataset=self._dataset,
80+
rng=self._derive_rng() if rng is not None else None,
81+
)
82+
self._address_generator = AddressGenerator(
83+
dataset=self._dataset,
84+
rng=self._derive_rng() if rng is not None else None,
85+
)
6686
self._demographic_config = resolved_config
6787
birth_min = self._demographic_config.age_min if self._demographic_config.age_min is not None else min_age
6888
birth_max = self._demographic_config.age_max if self._demographic_config.age_max is not None else max_age
6989
# Clamp birthdate sampling to caller-provided bounds without scattering defaults.
70-
self._birthdate_generator = BirthdateGenerator(min_age=birth_min, max_age=birth_max)
71-
self._academic_title_generator = AcademicTitleGenerator(dataset=self._dataset, quota=academic_title_quota)
72-
self._nobility_title_generator = NobilityTitleGenerator(dataset=self._dataset, noble_quota=noble_quota)
90+
self._birthdate_generator = BirthdateGenerator(
91+
min_age=birth_min,
92+
max_age=birth_max,
93+
rng=self._derive_rng() if rng is not None else None,
94+
)
95+
self._academic_title_generator = AcademicTitleGenerator(
96+
dataset=self._dataset,
97+
quota=academic_title_quota,
98+
rng=self._derive_rng() if rng is not None else None,
99+
)
100+
self._nobility_title_generator = NobilityTitleGenerator(
101+
dataset=self._dataset,
102+
noble_quota=noble_quota,
103+
rng=self._derive_rng() if rng is not None else None,
104+
)
105+
106+
def _derive_rng(self) -> Random:
107+
# Spawn child RNGs from the base seed so seeded descriptors replay without entangling independent draws.
# Each call draws one integer below 2**63 from self._rng and uses it as the seed of a new Random,
# so every call advances the parent stream by exactly one randrange() draw.
# When self._rng is not a Random instance, an unseeded Random() is returned instead.
108+
return Random(self._rng.randrange(2**63)) if isinstance(self._rng, Random) else Random()
73109

74110
@property
75111
def gender_generator(self) -> GenderGenerator:

datamimic_ce/domains/common/literal_generators/birthdate_generator.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# For questions and support, contact: info@rapiddweller.com
66

77
from datetime import datetime, timedelta
8+
from random import Random
89

910
from datamimic_ce.domains.common.literal_generators.datetime_generator import DateTimeGenerator
1011
from datamimic_ce.domains.domain_core.base_literal_generator import BaseLiteralGenerator
@@ -31,6 +32,7 @@ def __init__(
3132
self,
3233
min_age: int = 1,
3334
max_age: int = 100,
35+
rng: Random | None = None,
3436
) -> None:
3537
"""
3638
Parameters:
@@ -48,8 +50,14 @@ def __init__(
4850
today = datetime(today.year, 2, 28)
4951
self._min_birthdate = datetime(today.year - max_age - 1, today.month, today.day) + timedelta(days=1)
5052
self._max_birthdate = datetime(today.year - min_age, today.month, today.day)
53+
base_rng = rng or Random()
54+
# Derive a dedicated RNG for date sampling so seeded runs stay reproducible without cross-coupling streams.
55+
date_rng = Random(base_rng.randrange(2**63)) if rng is not None else Random()
5156
self._date_generator = DateTimeGenerator(
52-
min=str(self._min_birthdate), max=str(self._max_birthdate), random=True
57+
min=str(self._min_birthdate),
58+
max=str(self._max_birthdate),
59+
random=True,
60+
rng=date_rng,
5361
)
5462

5563
def generate(self) -> datetime:

datamimic_ce/domains/common/literal_generators/data_faker_generator.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@
44
# See LICENSE file for the full text of the license.
55
# For questions and support, contact: info@rapiddweller.com
66

7-
from typing import Any
7+
import random
8+
from typing import Any, Protocol, cast
89

910
from faker import Faker
1011

1112
from datamimic_ce.domains.domain_core.base_literal_generator import BaseLiteralGenerator
1213
from datamimic_ce.enums.faker_enums import UnsupportedMethod
1314

1415

16+
class _SupportsRandom(Protocol):
# Structural type describing any object that exposes a `random` attribute of type random.Random.
17+
random: random.Random
18+
19+
1520
class DataFakerGenerator(BaseLiteralGenerator):
1621
"""
1722
This is an implement of Python Faker
@@ -23,12 +28,17 @@ def __init__(
2328
method: str,
2429
locale: str | None = "en_US",
2530
*args,
31+
rng: random.Random | None = None,
2632
**kwargs,
2733
) -> None:
2834
# validation support methods
2935
if method in UnsupportedMethod._value2member_map_ or method.startswith("_"):
3036
raise ValueError(f"Faker method '{method}' is not supported")
3137
self._faker = Faker(locale)
38+
if rng is not None:
39+
# WHY: faker.Faker exposes a dynamic `random` attribute; cast to a protocol so mypy accepts the assignment.
40+
faker_with_random = cast(_SupportsRandom, self._faker)
41+
faker_with_random.random = rng
3242
self._method = method
3343
self._locale = locale
3444
self._args = args

datamimic_ce/domains/common/literal_generators/datetime_generator.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,10 @@ def __init__(
4444
minute_granularity: int | None = None,
4545
second_granularity: int | None = None,
4646
seed: int | None = None,
47+
rng: _random.Random | None = None,
4748
):
48-
# private RNG for determinism
49-
self._rng = _random.Random(seed)
49+
# Allow callers to inject their rng so rngSeed descriptors replay deterministically.
50+
self._rng = rng or _random.Random(seed)
5051

5152
# format and weights
5253
self._input_format = input_format if input_format else "%Y-%m-%d %H:%M:%S"

datamimic_ce/domains/common/literal_generators/domain_generator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ def __init__(self, dataset: str | None = None, rng: random.Random | None = None)
2727
self._tld_dataset = FileUtil.read_wgt_file(tld_path)
2828

2929
self._company_name: str | None = None
30-
self._company_name_generator = CompanyNameGenerator()
30+
# Share the deterministic RNG so seeded email/domain combos remain reproducible end-to-end.
31+
self._company_name_generator = CompanyNameGenerator(rng=self._rng)
3132

3233
def generate(self) -> str:
3334
"""

datamimic_ce/domains/common/literal_generators/email_address_generator.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,25 @@ def __init__(
2424
dataset: str | None = None,
2525
given_name: str | None = None,
2626
family_name: str | None = None,
27+
rng: random.Random | None = None,
2728
):
2829
self._dataset = (dataset or "US").upper() # align downstream generators with ISO-based datasets
29-
self._rng: random.Random = random.Random()
30+
self._rng: random.Random = rng or random.Random()
31+
32+
def _derive_rng() -> random.Random:
33+
# Split deterministic streams so rngSeed descriptors do not couple email joins with domain picks.
34+
return random.Random(self._rng.randrange(2**63)) if rng is not None else random.Random()
35+
3036
self._given_name = given_name
31-
self._given_name_generator = GivenNameGenerator(dataset=self._dataset) if given_name is None else None
37+
self._given_name_generator = (
38+
GivenNameGenerator(dataset=self._dataset, rng=_derive_rng()) if given_name is None else None
39+
)
3240
self._family_name = family_name
33-
self._family_name_generator = FamilyNameGenerator(dataset=self._dataset) if family_name is None else None
41+
self._family_name_generator = (
42+
FamilyNameGenerator(dataset=self._dataset, rng=_derive_rng()) if family_name is None else None
43+
)
3444
self._company_name: str | None = None
35-
self._domain_generator = DomainGenerator(dataset=self._dataset)
45+
self._domain_generator = DomainGenerator(dataset=self._dataset, rng=_derive_rng())
3646

3747
def generate(self) -> str:
3848
"""

datamimic_ce/domains/common/literal_generators/nobility_title_generator.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -43,31 +43,43 @@ def __init__(
4343
self._male_values, self._male_weights = FileUtil.read_wgt_file(file_path=male_file_path)
4444
self._female_values, self._female_weights = FileUtil.read_wgt_file(file_path=female_file_path)
4545

46-
def generate(self) -> str:
    """
    Generate a random nobility title for the gender configured on this generator.

    Returns:
        str: The result of ``generate_with_gender`` when the configured gender is
        "male", "female" or "other"; an empty string for any other gender value.
    """
    if self._gender not in ("male", "female", "other"):
        # Unknown or unset gender: answer with an empty string so the result is always text.
        return ""
    return self.generate_with_gender(self._gender)
5655

57-
def generate_with_gender(self, gender: str) -> str:
    """
    Generate a random nobility title for the given gender.

    One draw from the generator's RNG is compared against the noble quota to decide
    whether a title is produced at all; a second weighted draw picks the title.

    Parameters:
        gender: "male" or "female" selects the matching title table; any other
            value draws from the male and female tables combined.

    Returns:
        str: A nobility title, or an empty string when the quota draw fails or
        the selected table holds no titles.
    """
    if self._rng.random() >= self._noble_quota:
        return ""

    values: list[str]
    weights: list[float]
    if gender == "male":
        values = list(self._male_values)
        weights = list(self._male_weights)
    elif gender == "female":
        values = list(self._female_values)
        weights = list(self._female_weights)
    else:
        # WHY: Merge available titles for non-binary genders instead of returning None when quota triggers.
        values = [*self._male_values, *self._female_values]
        weights = [*self._male_weights, *self._female_weights]

    if not values:
        return ""

    # Align weight vector with values; default to uniform if source data is missing, misaligned,
    # or sums to zero or less (Random.choices raises ValueError on a non-positive total).
    if len(weights) != len(values) or sum(weights) <= 0:
        weights = [1.0] * len(values)

    return self._rng.choices(values, weights=weights, k=1)[0]

datamimic_ce/domains/common/models/person.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def given_name(self) -> str:
4949
Returns:
5050
The first name of the person.
5151
"""
52-
return self._person_generator.given_name_generator.generate()
52+
return self._person_generator.given_name_generator.generate_with_gender(self.gender)
5353

5454
@given_name.setter
5555
def given_name(self, value: str) -> None:
@@ -195,7 +195,7 @@ def academic_title(self) -> str | None:
195195
@property_cache
196196
def nobility_title(self) -> str | None:
197197
"""Get the nobility title of the person."""
198-
return self._person_generator.nobility_title_generator.generate()
198+
return self._person_generator.nobility_title_generator.generate_with_gender(self.gender)
199199

200200
@property
201201
@property_cache

datamimic_ce/domains/common/services/person_service.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ def __init__(
2828
other_gender_quota: float = 0.0,
2929
demographic_config: DemographicConfig | None = None,
3030
rng: Random | None = None,
31+
noble_quota: float = 0.001,
32+
academic_title_quota: float = 0.5,
3133
):
3234
resolved_config = (demographic_config or DemographicConfig()).with_defaults(
3335
default_age_min=min_age,
@@ -45,6 +47,9 @@ def __init__(
4547
other_gender_quota=other_gender_quota,
4648
demographic_config=resolved_config,
4749
rng=rng,
50+
# Thread descriptor-level overrides for noble/title quotas into the generator for determinism.
51+
noble_quota=noble_quota,
52+
academic_title_quota=academic_title_quota,
4853
),
4954
Person,
5055
)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
ES,es_ES,34,6[0-9],España,47500000,comunidad autónoma,provincia,municipio
2+
AD,ca_AD,376,6[0-9],Andorra,79000,parròquia,,parròquia
3+
GI,en_GI,350,5[6-9][0-9],Gibraltar,33700,district,,town
4+
EA,es_EA,34,6[0-9],Ceuta y Melilla,170000,ciudad autónoma,barriada,barrio

0 commit comments

Comments
 (0)