# Source: entity-resolution-engine-basic/src/config/resolver.yaml (branch: develop, repository: OP-TED/entity-resolution-engine-basic)
---
# Entity Resolver configuration — Extended blocking (address fields)
# Supports Jaro-Winkler similarity with geographic granularity via NUTS, PostCode, PostName, Thoroughfare.
#
# Entity fields: names must match fields in rdf_mapping.yaml entity_types.ORGANISATION.fields.
# Every field listed here also has an entry under splink.comparisons and
# splink.cold_start.comparisons in this file; keep the three lists in sync.
entity_fields:
  - legal_name
  - country_code
  - nuts_code
  - post_code
  - post_name
  - thoroughfare

# DuckDB database configuration.
duckdb:
  # Storage mode. Options: "in-memory" or "persistent".
  # type: in-memory
  type: persistent
  # Default path for persistent mode is data/app.duckdb (overridden by the
  # DUCKDB_PATH env var). Uncomment to set it explicitly.
  # path: data/app.duckdb

# Cache strategy used by the resolver.
# NOTE(review): "tf_incremental" presumably means term-frequency tables are
# maintained incrementally — confirm against the loader and list the other
# accepted values here.
cache_strategy: tf_incremental

# Cluster assignment threshold: requires match_probability >= threshold for cluster join.
# Address-enriched model is expected to be more confident; starting at 0.20.
# Adjust upward if precision is too low (too many false merges);
# downward if recall is insufficient (too many missed links).
threshold: 0.20

# Maximum number of cluster references returned per resolve_request() call.
top_n: 100

# Lower bound on match weight passed to find_matches_to_new_records().
# -10 captures below-threshold links needed for full candidate output, i.e. it
# is intentionally far more permissive than the cluster `threshold` key.
# NOTE(review): if this is Splink's log2 match weight, -10 corresponds to a
# match probability of roughly 0.001 — confirm.
match_weight_threshold: -10

# Automatic training threshold: trigger non-blocking EM
# (expectation-maximisation) training at N mentions.
auto_train_threshold: 50

splink:
  # Prior: Fellegi-Sunter λ (probability that two randomly chosen records match).
  # With address fields, expect slightly higher match rate (finer granularity helps).
  # Using 0.003 (vs 0.0022 for name-only) to account for address signal.
  # NOTE(review): λ describes the data set (share of truly matching pairs), not
  # the set of compared fields — confirm that raising it for the address model
  # is intended rather than re-estimating it from the data.
  probability_two_random_records_match: 0.003

  # Comparisons: similarity scoring applied to each candidate pair, one entry per field.
  # Address fields complement legal_name matching for disambiguation and confidence.
  # NOTE(review): an earlier note here stated that country_code is used only in
  # blocking rules, not comparisons ("to preserve EM training"), yet country_code
  # IS declared as a comparison (last entry below). Decide which is intended.
  comparisons:

    # Primary identifier: legal name with Jaro-Winkler thresholds.
    # Unchanged from baseline: >= 0.9 (very high similarity), >= 0.8 (quite similar), else (low).
    # cold_start lists four probabilities for this field, so one more level
    # (presumably exact match) sits above the two thresholds — TODO confirm.
    - type: jaro_winkler
      field: legal_name
      thresholds: [0.9, 0.8]

    # Supporting signals (lower confidence than legal name, but disambiguate)

    # NUTS code (Nomenclature of Territorial Units for Statistics)
    # Exact match only: same NUTS = same region, no NUTS = missing data.
    # Most EU organizations in procurement have NUTS; non-EU lack it.
    # Confidence: Match implies same country+region (but not unique identifier).
    - type: exact_match
      field: nuts_code

    # Post Code (postal / ZIP code)
    # Jaro-Winkler with high thresholds [0.95, 0.85] for typo tolerance.
    - type: jaro_winkler
      field: post_code
      thresholds: [0.95, 0.85]

    # Post Name (city/locality name, e.g., "Frankfurt am Main")
    # Jaro-Winkler at [0.90, 0.80]: captures accents, punctuation and spelling variations.
    # >= 0.90: high confidence, e.g. "München" vs "Munchen" (approx. 0.91) or
    #          "St. John's" vs "St Johns" (approx. 0.95).
    # >= 0.80: moderate confidence, e.g. accent-folded "Munchen" vs "Munich" (approx. 0.89).
    # NOTE(review): translated names such as "München" vs "Munich" score only
    #               about 0.77 and fall into the lowest level; scores above are
    #               hand-computed standard Jaro-Winkler — verify with the engine.
    # Caveat: multiple companies in same city; not alone sufficient for match.
    - type: jaro_winkler
      field: post_name
      thresholds: [0.90, 0.80]

    # Thoroughfare (street address: road name + house number)
    # Jaro-Winkler at [0.95, 0.85]: very high threshold due to specificity.
    # >= 0.95: near-identical strings; a digit transposition such as "12 Main" vs
    #          "21 Main" still scores approx. 0.95 although the house number differs.
    # >= 0.85: captures common abbreviations ("Street" vs "St.", "Avenue" vs "Ave");
    #          a single-digit substitution such as "4 Main" vs "5 Main" also lands
    #          here (approx. 0.89), so this level does not imply the same building.
    # Rationale: Street addresses are highly specific; typos are unusual but possible.
    #            Organizations may move offices, so don't rely on this alone.
    - type: jaro_winkler
      field: thoroughfare
      thresholds: [0.95, 0.85]

    # Country code: exact match only (baseline blocking rule support).
    # Both blocking rules require equal country_code, so every compared pair hits
    # the "match" level. This is constant across pairs but not neutral: with the
    # cold-start values in this file (m=0.99, u=0.10) it multiplies the odds of
    # every pair by about 9.9 (roughly +3.3 match weight).
    - type: exact_match
      field: country_code

  # Blocking rules: pairs are compared only if at least ONE rule fires.
  # Expressed as field names; multi-field rules use a list.
  #
  # Design: country-level primary blocking, NUTS-level secondary for EU.
  # NOTE(review): because rules are OR-ed, every pair that satisfies
  # [country_code, nuts_code] already satisfies country_code alone, so the
  # second rule adds no candidate pairs on top of the first. Keep it only if the
  # engine also uses rules individually (e.g. for EM training) — confirm.
  blocking_rules:
    # Primary: country code (strict rule: must match country)
    - country_code

    # Secondary (EU-specific): country + NUTS code blocking for finer granularity
    # Enables disambiguation of large countries (e.g., Germany's 290+ regions).
    # Falls back gracefully: if NUTS missing, country-only rule fires.
    # Only applies when both records have NUTS (common for EU procurement data).
    - [country_code, nuts_code]

  # Cold-start default m/u probabilities (used before EM training).
  # Each comparison field gets one value per similarity level:
  #   m_probabilities: P(level | the two records are a match)
  #   u_probabilities: P(level | the two records are not a match)
  # Once EM training completes, trained parameters overwrite these.
  # NOTE(review): exact_match fields carry 2 values [match, no-match];
  # jaro_winkler fields with two thresholds carry 4 values, presumably ordered
  # [exact, JW >= t1, JW >= t2, else] — confirm the order against the loader.
  # NOTE(review): every u list below sums to 1.0, but the 4-value m lists do not
  # (sums are noted per field). As distributions over levels they should —
  # renormalise or confirm the values are intentional.
  cold_start:
    comparisons:

      legal_name:
        # Jaro-Winkler thresholds [0.9, 0.8]; four values per list (see level-order note above).
        # m_prob: likelihood of each similarity level among true matches (empirically tuned).
        # u_prob: likelihood of each similarity level among non-matching (random) pairs.
        # m sums to 1.53; u sums to 1.0.
        m_probabilities: [0.9, 0.6, 0.025, 0.005]
        u_probabilities: [0.00001, 0.0004, 0.004, 0.99559]

      country_code:
        # ExactMatch: match / no-match
        # Near-deterministic: matching country codes strongly imply same country.
        # u=0.10 for the match level means unrelated records are assumed to share
        # a country about 10% of the time.
        m_probabilities: [0.99, 0.01]
        u_probabilities: [0.10, 0.90]

      nuts_code:
        # ExactMatch: match / no-match
        # Strong signal: same NUTS region = same EU administrative region.
        # However, some organizations have operations across multiple NUTS (parent + branches).
        # m_prob=0.92: "likely same location" but not identity.
        # u_prob=0.05: in random records, small chance of NUTS collision across large geographic areas.
        m_probabilities: [0.92, 0.08]
        u_probabilities: [0.05, 0.95]

      post_code:
        # Jaro-Winkler thresholds [0.95, 0.85]; four values per list (see level-order note above).
        #   - JW >= 0.95: nearly identical postal codes
        #   - JW >= 0.85: postal codes with minor variations (digit transposition, typo)
        #   - else: different postal zone or missing data
        # m_prob: 0.85, 0.40, 0.02, 0.005 (sums to 1.275).
        # u_prob: 0.02, 0.08, 0.08, 0.82 (sums to 1.0); high-similarity collisions
        #         between unrelated records are assumed rare.
        m_probabilities: [0.85, 0.40, 0.02, 0.005]
        u_probabilities: [0.02, 0.08, 0.08, 0.82]

      post_name:
        # Jaro-Winkler thresholds [0.90, 0.80]; four values per list (see level-order note above).
        # Moderate confidence: city names are less unique than addresses.
        # Multiple organizations can be in same city.
        # m_prob: 0.65, 0.25, 0.01, 0.005 (sums to 0.915).
        # u_prob: 0.05, 0.15, 0.15, 0.65 (sums to 1.0).
        m_probabilities: [0.65, 0.25, 0.01, 0.005]
        u_probabilities: [0.05, 0.15, 0.15, 0.65]

      thoroughfare:
        # Jaro-Winkler thresholds [0.95, 0.85]; four values per list (see level-order note above).
        # Very strong signal: street addresses are highly specific.
        # Same street + building = likely same organization (or landlord info).
        #   - JW >= 0.95: almost certainly same office location
        #   - JW >= 0.85: likely same street but possibly different building/unit
        #   - else: different address or missing data
        # m_prob: 0.85, 0.50, 0.02, 0.005 (sums to 1.375).
        # u_prob: 0.01, 0.08, 0.08, 0.83 (sums to 1.0); street collisions between
        #         unrelated records are assumed rare.
        m_probabilities: [0.85, 0.50, 0.02, 0.005]
        u_probabilities: [0.01, 0.08, 0.08, 0.83]