fix(migration): clean address_line + postal_code + connection_type at import

Fixes four legacy data-quality issues that were leaking into ERPNext on
every import run. They were caught while auditing C-LPB4's mis-pinned
dispatch job.

1. **Postal code embedded in address_line.** Legacy `gestionclient` had
   rows like `2200-3 chemin de la riviere de la guerre  J0S1B0` with
   the postal code concatenated at the end (and the same code repeated
   in the dedicated zip column). Caused 48-char address_line on what
   should have been a 39-char address. Now stripped at import: a regex
   matches `\s+<FSA><LDU>\s*$` (with or without a space between the FSA
   and the LDU) and removes it; the dedicated postal_code field carries
   the canonical form.

2. **Abbreviations + Cobol-style capitalization.** Legacy stored
   `2066 Ch De La 1Re-Concession` instead of the canonical
   `2066 Chemin de la 1re-Concession`. ABBREV_MAP expands `Ch` →
   `Chemin`, `Av` → `Avenue`, `Bd`/`Boul` → `Boulevard`, `Rte` →
   `Route`, `St-` → `Saint-`, `Ste-` → `Sainte-`, `Mtl` → `Montréal`.
   Title-casing rule preserves French articles lowercase (`de`, `du`,
   `des`, `la`, `le`, `les`, `au`, `aux`, `à`, `et`, `sur`, `en`)
   and ordinal markers (`1re`, `2e`, `3e`). 96 SLs in production had
   the `1Re-Concession` style; they'll be re-normalized on next
   migration run.

3. **`connection_type` left empty even when ONT/CPE devices existed.**
   Pre-loads device→delivery mapping at import start; if the legacy
   delivery has any device whose category/name/model contains "ont",
   "onu", "cpe", "fibre", "gpon", or "ftth", we set
   connection_type='Fibre FTTH'. Without devices on file, the field
   stays empty (rep fills it later) — we don't guess.

4. **`postal_code` normalized too** — `j0s1b0` → `J0S 1B0` (uppercase
   + canonical space). Was being inserted in lowercase no-space form.

Self-tested on 8 representative cases including the actual broken
records found in production (LOC-15903, LOC-6227, LOC-4 / C-LPB4).

These changes affect only re-imports of locations. Existing data
needs a separate backfill script — a follow-up will cover that
either as a one-shot migration or by running the existing
`reimport_subscriptions.py` after this script.
This commit is contained in:
louispaulb 2026-05-08 15:38:19 -04:00
parent 1186e50bbe
commit 10afd696ae

View File

@ -57,6 +57,135 @@ def clean(val):
return ""
return unescape(str(val)).strip()
# ── Address normalization helpers ─────────────────────────────────────────
# These exist because the legacy `gestionclient` data is full of:
# - postal codes embedded in address_line (e.g. "12 rue X J0S1B0")
# - abbreviations ("Ch" instead of "Chemin", "Av." for "Avenue")
# - Cobol-style capitalization ("1Re-Concession" instead of "1re-Concession")
# - lowercase or no-space postal codes ("j0s1b0" vs "J0S 1B0")
# Without normalization these break the RQA address validator (no exact
# match), Mapbox geocoder fuzzy results, and the human eye on customer
# cards. We catch them at the import step so re-runs converge on the
# canonical form.
import re
# Common French / Quebec abbreviations seen in legacy data, stored as
# (regex, replacement) pairs. clean_address_line() applies every pair in
# list order with re.sub(..., flags=re.I), so matching is case-insensitive.
#
# Street types ("Ch", "Rte", "Av", "Bd", "Boul") and "Mtl":
#   the abbreviation must be a whole word (\b...\b) and an optional trailing
#   period is swallowed. The period comes AFTER the inner \b so the
#   whole-word check happens first and "Boul." becomes "Boulevard", not
#   "Boulevard.".
#
# "St" / "Ste":
#   expanded only when followed by ".", "-" or ".-" AND then (after optional
#   whitespace) a word character. The separator and any whitespace are
#   consumed so the replacement's own hyphen is the only one left:
#     "St-Michel"  -> "Saint-Michel"
#     "St. Louis"  -> "Saint-Louis"    (previously "Saint- Louis")
#     "St.-Michel" -> "Saint-Michel"   (previously "Saint--Michel")
#   A trailing "St." with nothing after it is left untouched rather than
#   turned into a dangling "Saint-".
ABBREV_MAP = [
    (r'\bCh\b\.?', 'Chemin'),
    (r'\bRte\b\.?', 'Route'),
    (r'\bAv\b\.?', 'Avenue'),
    (r'\bBd\b\.?', 'Boulevard'),
    (r'\bBoul\b\.?', 'Boulevard'),
    (r'\bSt(?:\.-?|-)\s*(?=\w)', 'Saint-'),
    (r'\bSte(?:\.-?|-)\s*(?=\w)', 'Sainte-'),
    (r'\bMtl\b\.?', 'Montréal'),
]

# Words that should stay lowercase even when title-casing the address.
LOWER_WORDS = {
    'de', 'du', 'des', 'la', 'le', 'les', "l'", "d'",
    'au', 'aux', 'à',   # Rivière-aux-Outardes, Pointe-au-Chêne
    'et', 'sur',        # Saint-Pierre-et-Miquelon, Bois-sur-Rivière
    'en',               # Cap-en-Haut
}

# Ordinal markers ("1re", "2e", "3e", ...): digits followed by a French
# ordinal suffix. RQA standard puts the suffix in lowercase. The pattern is
# anchored on both ends, so it must match the whole token.
ORDINAL_RE = re.compile(r'^(\d+)(re|er|e|ere|eme|ème)$', re.I)
def normalize_postal_code(pc):
    """Return a postal code in canonical 'A1A 1A1' form.

    All whitespace is removed and the value is uppercased; if the result
    has the letter-digit-letter-digit-letter-digit shape, a single space is
    inserted after the third character ('j0s1b0' -> 'J0S 1B0').

    A value that does not have that shape is returned uppercased and
    whitespace-free so the rep can still see it. Falsy input, or input that
    is whitespace only, yields None.
    """
    if not pc:
        return None
    compact = re.sub(r'\s+', '', str(pc)).upper()
    if re.fullmatch(r'[A-Z]\d[A-Z]\d[A-Z]\d', compact):
        return '{} {}'.format(compact[:3], compact[3:])
    # Malformed: pass it through untouched (or None when nothing is left).
    return compact if compact else None
# One or more Canadian-format postal codes (inner space optional) sitting at
# the very end of a string, together with the whitespace/commas around them.
_TRAILING_PC_RE = re.compile(
    r'(?:[\s,]+[A-Z]\d[A-Z]\s?\d[A-Z]\d)+[\s,]*$', re.I)
# Shape of a whitespace-free, well-formed postal code.
_WELL_FORMED_PC_RE = re.compile(r'[A-Z]\d[A-Z]\d[A-Z]\d', re.I)


def _strip_trailing_postal_code(s, postal_code=None):
    """Remove postal codes that leaked into the end of an address string."""
    # Generic pass: any run of well-formed codes at the end (covers the
    # doubled "... J0S1B0 J0S 1B0" case), plus a separating comma if present.
    s = _TRAILING_PC_RE.sub('', s).strip()
    if not postal_code:
        return s
    compact = re.sub(r'\s+', '', str(postal_code))
    if not compact:
        return s
    if _WELL_FORMED_PC_RE.fullmatch(compact):
        # Well-formed code from the dedicated column: also strip it when it
        # is glued to the preceding text or is the whole field, in either
        # the spaced or unspaced form.
        pattern = (r'[\s,]*' + re.escape(compact[:3]) + r'\s?'
                   + re.escape(compact[3:]) + r'[\s,]*$')
    else:
        # Malformed value (e.g. "0"): only strip it as a separate trailing
        # token, otherwise it would eat the end of "app 10".
        pattern = r'(?:^|[\s,]+)' + re.escape(compact) + r'[\s,]*$'
    return re.sub(pattern, '', s, flags=re.I).strip()


def _title_case_chunk(chunk, keep_capital):
    """Title-case one hyphen-free piece of a word.

    keep_capital is True for the very first piece of the address, which is
    capitalized even when it is an article.
    """
    low = chunk.lower()
    if ORDINAL_RE.match(low):
        return low  # "1Re" -> "1re"
    if low in LOWER_WORDS and not keep_capital:
        return low
    # Elision ("l'église", "d'Youville"): case the prefix and the noun
    # separately so the noun keeps its capital.
    if len(chunk) > 2 and chunk[1:2] in ("'", "\u2019"):
        prefix, rest = chunk[:2], chunk[2:]
        if (prefix[0].lower() + "'") in LOWER_WORDS and not keep_capital:
            prefix = prefix.lower()
        else:
            prefix = prefix[:1].upper() + prefix[1:]
        return prefix + rest[:1].upper() + rest[1:].lower()
    return chunk[:1].upper() + chunk[1:].lower()


def clean_address_line(raw, postal_code=None):
    """Normalize a legacy address_line for ingestion into Service Location.

    Steps, in order:
      1. HTML-unescape and strip the raw value.
      2. Strip postal code(s) that leaked into the end of the field, both
         the generic 'A1A 1A1' / 'A1A1A1' shape and the value passed in
         ``postal_code`` (the dedicated column).
      3. Expand the abbreviations listed in ABBREV_MAP (case-insensitive).
      4. Title-case word by word, splitting on any whitespace and on
         hyphens. Words in LOWER_WORDS and ordinal markers matched by
         ORDINAL_RE stay lowercase, except that the very first piece of
         the address is always capitalized. Elided articles keep the noun
         capitalized ("l'eglise" -> "l'Eglise").
      5. Join with single spaces.

    Args:
        raw: the legacy address value; falsy values return "".
        postal_code: optional postal code from the dedicated column, in
            any spacing/case; non-string values are converted with str().

    Returns:
        The normalized address string ("" when ``raw`` is falsy).

    Examples (comments, not doctests):
        "2066 Ch De La 1Re-Concession"  -> "2066 Chemin de la 1re-Concession"
        "12 rue de l'eglise, j0s 1b0"   -> "12 Rue de l'Eglise"
    """
    if not raw:
        return ""
    s = unescape(str(raw)).strip()

    s = _strip_trailing_postal_code(s, postal_code)

    for pat, repl in ABBREV_MAP:
        s = re.sub(pat, repl, s, flags=re.I)

    # str.split() with no argument drops empty tokens and splits on every
    # whitespace run, so the result needs no further collapsing.
    words = []
    for i, word in enumerate(s.split()):
        chunks = word.split('-')
        words.append('-'.join(
            _title_case_chunk(chunk, keep_capital=(i == 0 and j == 0))
            for j, chunk in enumerate(chunks)
        ))
    return ' '.join(words)
def detect_connection_type(devices_for_delivery):
    """Infer the connection type of a Service Location from its devices.

    Each device's 'category', 'name' and 'model' values are cleaned,
    lowercased and joined; the delivery is considered to be on fibre when
    that text contains:
      * one of the acronyms "ont", "onu" or "cpe" (optionally plural)
        standing on its own, i.e. not embedded inside a longer run of
        letters. Digits, hyphens, underscores and spaces count as
        separators, so "ONT-1234", "hg8245_ont" and "ont2" match while
        "Montréal", "controller" and "bonus" do not; or
      * one of the substrings "fibre", "gpon" or "ftth" anywhere.

    Args:
        devices_for_delivery: iterable of dict-like device rows; may be
            empty or None.

    Returns:
        'Fibre FTTH' when a fibre device is found, otherwise None. Without
        devices the field is left empty; the rep will fill it in later.
    """
    if not devices_for_delivery:
        return None
    # [^\W\d_] is "any Unicode letter"; the lookarounds reject an acronym
    # that is glued to letters on either side.
    acronym_re = re.compile(r'(?<![^\W\d_])(?:ont|onu|cpe)s?(?![^\W\d_])')
    keywords = ('fibre', 'gpon', 'ftth')
    for d in devices_for_delivery:
        combined = ' '.join(
            clean(d.get(field, '')).lower()
            for field in ('category', 'name', 'model')
        )
        if acronym_re.search(combined) or any(k in combined for k in keywords):
            return 'Fibre FTTH'
    return None
def log(msg):
    """Print ``msg`` to stdout prefixed with the current UTC time as [HH:MM:SS].

    Output is flushed on every call.
    """
    stamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
    line = "[{}] {}".format(stamp, msg)
    print(line, flush=True)
@ -157,6 +286,17 @@ def main():
existing_loc = set(r[0] for r in pgc.fetchall())
log(" {} already imported".format(len(existing_loc)))
# Pre-load device → delivery_id mapping from legacy so we can detect
# fibre availability per Service Location at insert time. Keyed by
# delivery_id, value = list of devices on that delivery.
cur.execute("SELECT delivery_id, category, name, model FROM device WHERE delivery_id IS NOT NULL")
devices_by_delivery = {}
for row in cur.fetchall():
did_dev = row.get('delivery_id')
if did_dev:
devices_by_delivery.setdefault(did_dev, []).append(row)
log(" {} deliveries have at least one device".format(len(devices_by_delivery)))
# delivery_id → Service Location name mapping (for phases 3-5)
del_map = {}
loc_ok = loc_skip = loc_err = 0
@ -174,8 +314,19 @@ def main():
loc_err += 1
continue
addr = clean(d.get("address1"))
city = clean(d.get("city"))
# ── Normalize address fields BEFORE insert ──
# postal_code: "j0s1b0" → "J0S 1B0" (canonical form with space).
# address_line: strip embedded postal codes, expand "Ch."→"Chemin"
# etc., title-case with French article rules. The migrated dataset
# has 96+ "1Re-Concession"-style entries that the RQA validator
# can't match without this pass.
postal_norm = normalize_postal_code(clean(d.get("zip")))
addr = clean_address_line(d.get("address1"), postal_norm)
city_raw = clean(d.get("city"))
# City: title-case with the same article rules used for road names
# ("Saint-Louis-de-Gonzague" stays correct, "saint-michel" gets
# capitalized to "Saint-Michel").
city = clean_address_line(city_raw) if city_raw else ""
loc_name_display = clean(d.get("name")) or "{}, {}".format(addr, city) if addr else "Location-{}".format(did)
loc_id = uid("LOC-")
@ -190,6 +341,11 @@ def main():
except (ValueError, TypeError):
pass
# Detect fibre availability from the devices we pre-loaded.
# Without devices we leave connection_type empty rather than
# guessing — the rep fills it later.
conn_type = detect_connection_type(devices_by_delivery.get(did, []))
try:
pgc.execute("""
INSERT INTO "tabService Location" (
@ -198,6 +354,7 @@ def main():
address_line, city, postal_code, province,
latitude, longitude,
contact_name, contact_phone,
connection_type,
legacy_delivery_id
) VALUES (
%s, %s, %s, %s, %s, 0, 0,
@ -205,16 +362,18 @@ def main():
%s, %s, %s, %s,
%s, %s,
%s, %s,
%s,
%s
)
""", (loc_id, now, now, ADMIN, ADMIN,
cust_id, loc_name_display[:140],
addr or "N/A", city or "N/A",
clean(d.get("zip")) or None,
postal_norm,
clean(d.get("state")) or "QC",
lat, lon,
clean(d.get("contact")) or None,
clean(d.get("tel_home")) or clean(d.get("cell")) or None,
conn_type,
did))
del_map[did] = loc_id