fix(migration): clean address_line + postal_code + connection_type at import
Four legacy data-quality issues that were leaking into ERPNext on every import run. Caught while auditing C-LPB4's mis-pinned dispatch job. 1. **Postal code embedded in address_line.** Legacy `gestionclient` had rows like `2200-3 chemin de la riviere de la guerre J0S1B0` with the postal code concatenated at the end (and the same code repeated in the dedicated zip column). Caused 48-char address_line on what should have been a 39-char address. Now stripped at import: a regex matches `\\s+<FSA><LDU>\\s*$` (with or without space) and removes it; the dedicated postal_code field carries the canonical form. 2. **Abbreviations + Cobol-style capitalization.** Legacy stored `2066 Ch De La 1Re-Concession` instead of the canonical `2066 Chemin de la 1re-Concession`. ABBREV_MAP expands `Ch` → `Chemin`, `Av` → `Avenue`, `Bd`/`Boul` → `Boulevard`, `Rte` → `Route`, `St-` → `Saint-`, `Ste-` → `Sainte-`, `Mtl` → `Montréal`. Title-casing rule preserves French articles lowercase (`de`, `du`, `des`, `la`, `le`, `les`, `au`, `aux`, `à`, `et`, `sur`, `en`) and ordinal markers (`1re`, `2e`, `3e`). 96 SLs in production had the `1Re-Concession` style; they'll be re-normalized on next migration run. 3. **`connection_type` left empty even when ONT/CPE devices existed.** Pre-loads device→delivery mapping at import start; if the legacy delivery has any device whose category/name/model contains "ont", "onu", "cpe", "fibre", "gpon", or "ftth", we set connection_type='Fibre FTTH'. Without devices on file, the field stays empty (rep fills it later) — we don't guess. 4. **`postal_code` normalized too** — `j0s1b0` → `J0S 1B0` (uppercase + canonical space). Was being inserted in lowercase no-space form. Self-tested on 8 representative cases including the actual broken records found in production (LOC-15903, LOC-6227, LOC-4 / C-LPB4). These changes affect only re-imports of locations.
Existing data needs a separate backfill script — a follow-up will cover that either as a one-shot migration or by running the existing `reimport_subscriptions.py` after this script.
This commit is contained in:
parent
1186e50bbe
commit
10afd696ae
|
|
@ -57,6 +57,135 @@ def clean(val):
|
|||
return ""
|
||||
return unescape(str(val)).strip()
|
||||
|
||||
|
||||
# ── Address normalization helpers ─────────────────────────────────────────
|
||||
# These exist because the legacy `gestionclient` data is full of:
|
||||
# - postal codes embedded in address_line (e.g. "12 rue X J0S1B0")
|
||||
# - abbreviations ("Ch" instead of "Chemin", "Av." for "Avenue")
|
||||
# - Cobol-style capitalization ("1Re-Concession" instead of "1re-Concession")
|
||||
# - lowercase or no-space postal codes ("j0s1b0" vs "J0S 1B0")
|
||||
# Without normalization these break the RQA address validator (no exact
|
||||
# match), Mapbox geocoder fuzzy results, and the human eye on customer
|
||||
# cards. We catch them at the import step so re-runs converge on the
|
||||
# canonical form.
|
||||
import re
|
||||
|
||||
# Common French / Quebec street-type abbreviations seen in legacy data.
# Each entry is (regex, replacement). clean_address_line() applies them in
# list order with re.IGNORECASE, so the patterns stay plain strings (a
# pre-compiled pattern cannot be combined with a `flags=` argument).
#
# Street types: match the abbreviation as a full word ("\b...\b"), THEN
# optionally swallow a trailing period. The dot sits AFTER the inner \b so
# the whole-word check happens first; otherwise "Boul." would match only
# "Boul" and leave the dot behind ("Boulevard.").
#
# Saint / Sainte: accept "St-X", "St.-X", "St. X" and "St.X" and always emit
# exactly one hyphen with no space after it ("St. Louis" → "Saint-Louis").
# The lookahead requires a letter to follow, so a dangling "St." at the end
# of a line is left alone instead of becoming a bare "Saint-".
ABBREV_MAP = [
    (r'\bCh\b\.?', 'Chemin'),
    (r'\bRte\b\.?', 'Route'),
    (r'\bAv\b\.?', 'Avenue'),
    (r'\bBd\b\.?', 'Boulevard'),
    (r'\bBoul\b\.?', 'Boulevard'),
    (r'\bSt(?:\.?-|\.\s*)(?=[^\W\d_])', 'Saint-'),
    (r'\bSte(?:\.?-|\.\s*)(?=[^\W\d_])', 'Sainte-'),
    (r'\bMtl\b\.?', 'Montréal'),
]

# Words that should stay lowercase even when title-casing the address.
# The "l'" / "d'" entries describe elided articles (article + apostrophe).
LOWER_WORDS = {
    'de', 'du', 'des', 'la', 'le', 'les', "l'", "d'",
    'au', 'aux', 'à',   # Rivière-aux-Outardes, Pointe-au-Chêne
    'et', 'sur',        # Saint-Pierre-et-Miquelon, Bois-sur-Rivière
    'en',               # Cap-en-Haut
}

# RQA standard puts ordinal markers ("1re", "2e", "3e") in lowercase.
# Group 1 is the number, group 2 the ordinal suffix; matching is
# case-insensitive so "1Re" is recognised too.
ORDINAL_RE = re.compile(r'^(\d+)(re|er|e|ere|eme|ème)$', re.I)
|
||||
|
||||
|
||||
def normalize_postal_code(pc):
    """Return a postal code in canonical 'A1A 1A1' form.

    All whitespace is removed and the value is upper-cased. If the result has
    the letter-digit-letter-digit-letter-digit shape, a single space is
    inserted after the third character ('j0s1b0' → 'J0S 1B0').

    Falsy input, or input that is only whitespace, yields None. Any other
    value that does not fit the six-character shape is returned compacted and
    upper-cased but otherwise untouched.
    """
    if not pc:
        return None

    compact = ''.join(re.split(r'\s+', str(pc))).upper()

    if re.fullmatch(r'[A-Z]\d[A-Z]\d[A-Z]\d', compact) is None:
        # Malformed: pass it through so the rep sees it and can correct it.
        return compact if compact else None

    head, tail = compact[:3], compact[3:]
    return '{} {}'.format(head, tail)
|
||||
|
||||
|
||||
# Postal code that leaked onto the end of an address line, with or without
# the inner space (" J0S1B0", " J0S 1B0"). Compiled once at import time
# because clean_address_line() runs for every imported row.
_TRAILING_POSTAL_RE = re.compile(r'\s+[A-Z]\d[A-Z]\s?\d[A-Z]\d\s*$', re.I)

# Shape of a whitespace-free, upper-cased postal code: letter-digit x3.
_POSTAL_SHAPE_RE = re.compile(r'[A-Z]\d[A-Z]\d[A-Z]\d')


def _case_piece(piece, keep_article_lower):
    """Apply the casing rules to one word, or one piece of a hyphenated word.

    `keep_article_lower` is False only for the very first piece of the line,
    which is always capitalised.
    """
    lowered = piece.lower()

    # Ordinal markers ("1re", "2e") are written entirely in lowercase.
    if ORDINAL_RE.match(lowered):
        return lowered

    if keep_article_lower and lowered in LOWER_WORDS:
        return lowered

    # Elided article ("l'église", "d'youville"): the article follows the
    # article rule, the word after the apostrophe is capitalised. Both the
    # ASCII and the typographic apostrophe are accepted.
    for apostrophe in ("'", "\u2019"):
        article, sep, rest = lowered.partition(apostrophe)
        if sep and rest and (article + "'") in LOWER_WORDS:
            if not keep_article_lower:
                article = article.capitalize()
            return article + sep + rest[:1].upper() + rest[1:]

    return piece[:1].upper() + piece[1:].lower()


def clean_address_line(raw, postal_code=None):
    """Normalize a legacy address_line for ingestion into Service Location.

    Steps, in order:

    1. Strip a postal code that leaked into the end of the field (legacy bug
       that caused 48-char address_line on LPB4's neighbour). A copy of
       `postal_code` glued to the end without a leading space is removed too,
       but only when `postal_code` really has the postal-code shape. A
       malformed value such as "138" must not eat the tail of "Route 138".
    2. Expand the abbreviations listed in ABBREV_MAP (case-insensitive).
    3. Re-case word by word, and piece by piece inside hyphenated words:
       LOWER_WORDS articles stay lowercase except at the very start of the
       line, ordinal markers ('1re', '2e') stay lowercase, elided articles
       keep the word after the apostrophe capitalised ("l'Église"), and
       everything else gets an initial capital.
    4. Collapse every run of whitespace into a single space.

    Args:
        raw: Legacy address text. Falsy values give "". The value is
            stringified and passed through `unescape` before processing.
        postal_code: Optional value of the dedicated postal-code field, in
            any casing or spacing.

    Returns:
        The normalized address line, or "" when there is nothing to keep.
    """
    if not raw:
        return ""
    s = unescape(str(raw)).strip()

    # 1. Strip embedded postal code (with or without the inner space).
    s = _TRAILING_POSTAL_RE.sub('', s).strip()
    if postal_code:
        pc_clean = re.sub(r'\s+', '', str(postal_code)).upper()
        # Malformed values are passed through by the postal-code normalizer
        # so a rep can fix them; never use one as a strip pattern.
        if _POSTAL_SHAPE_RE.fullmatch(pc_clean):
            glued = (re.escape(pc_clean[:3]) + r'\s?'
                     + re.escape(pc_clean[3:]) + r'\s*$')
            s = re.sub(glued, '', s, flags=re.I).strip()

    # 2. Expand abbreviations.
    for pat, repl in ABBREV_MAP:
        s = re.sub(pat, repl, s, flags=re.I)

    # 3 + 4. Splitting on any whitespace (not just ' ') lets tab-separated
    # words get their own casing and makes the final join the collapse step.
    words = []
    for word in s.split():
        first_word = not words
        words.append('-'.join(
            _case_piece(piece, keep_article_lower=not (first_word and j == 0))
            for j, piece in enumerate(word.split('-'))
        ))
    return ' '.join(words)
|
||||
|
||||
|
||||
# Keywords that mark a device as fibre equipment. The three-letter acronyms
# must not be embedded in a longer run of letters: a bare substring test
# makes 'ont' fire on "Montréal", "front" or "controller". Digits,
# punctuation and underscores still count as separators, so "ONT-1234",
# "ont_huawei" and "onu2" keep matching. The longer keywords are distinctive
# enough to stay plain substring matches.
# NOTE(review): "cpe" is also used for non-fibre customer equipment —
# confirm it is a reliable fibre signal in the legacy device table.
_FIBRE_DEVICE_RE = re.compile(
    r'(?<![^\W\d_])(?:ont|onu|cpe)(?![^\W\d_])|fibre|gpon|ftth'
)


def detect_connection_type(devices_for_delivery):
    """Return 'Fibre FTTH' when any device of a delivery looks like fibre gear.

    For each device, the 'category', 'name' and 'model' values are passed
    through clean(), lower-cased and joined with spaces; the delivery is
    considered fibre as soon as one device's text contains a fibre keyword
    (ont, onu, cpe as stand-alone acronyms; fibre, gpon, ftth anywhere).

    Args:
        devices_for_delivery: Iterable of mapping-like device rows (anything
            with `.get`), or a falsy value when the delivery has no devices.

    Returns:
        'Fibre FTTH' on the first match, otherwise None. Without devices we
        leave the field empty and the rep fills it in later.
    """
    if not devices_for_delivery:
        return None
    for d in devices_for_delivery:
        combined = ' '.join(
            clean(d.get(field, '')).lower()
            for field in ('category', 'name', 'model')
        )
        if _FIBRE_DEVICE_RE.search(combined):
            return 'Fibre FTTH'
    return None
|
||||
|
||||
def log(msg):
    """Print *msg* to stdout, prefixed with the current UTC time as [HH:MM:SS].

    The line is flushed immediately rather than left in the output buffer.
    """
    stamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
    line = "[{}] {}".format(stamp, msg)
    print(line, flush=True)
|
||||
|
||||
|
|
@ -157,6 +286,17 @@ def main():
|
|||
existing_loc = set(r[0] for r in pgc.fetchall())
|
||||
log(" {} already imported".format(len(existing_loc)))
|
||||
|
||||
# Pre-load device → delivery_id mapping from legacy so we can detect
|
||||
# fibre availability per Service Location at insert time. Keyed by
|
||||
# delivery_id, value = list of devices on that delivery.
|
||||
cur.execute("SELECT delivery_id, category, name, model FROM device WHERE delivery_id IS NOT NULL")
|
||||
devices_by_delivery = {}
|
||||
for row in cur.fetchall():
|
||||
did_dev = row.get('delivery_id')
|
||||
if did_dev:
|
||||
devices_by_delivery.setdefault(did_dev, []).append(row)
|
||||
log(" {} deliveries have at least one device".format(len(devices_by_delivery)))
|
||||
|
||||
# delivery_id → Service Location name mapping (for phases 3-5)
|
||||
del_map = {}
|
||||
loc_ok = loc_skip = loc_err = 0
|
||||
|
|
@ -174,8 +314,19 @@ def main():
|
|||
loc_err += 1
|
||||
continue
|
||||
|
||||
addr = clean(d.get("address1"))
|
||||
city = clean(d.get("city"))
|
||||
# ── Normalize address fields BEFORE insert ──
|
||||
# postal_code: "j0s1b0" → "J0S 1B0" (canonical form with space).
|
||||
# address_line: strip embedded postal codes, expand "Ch."→"Chemin"
|
||||
# etc., title-case with French article rules. The migrated dataset
|
||||
# has 96+ "1Re-Concession"-style entries that the RQA validator
|
||||
# can't match without this pass.
|
||||
postal_norm = normalize_postal_code(clean(d.get("zip")))
|
||||
addr = clean_address_line(d.get("address1"), postal_norm)
|
||||
city_raw = clean(d.get("city"))
|
||||
# City: title-case with the same article rules used for road names
|
||||
# ("Saint-Louis-de-Gonzague" stays correct, "saint-michel" gets
|
||||
# capitalized to "Saint-Michel").
|
||||
city = clean_address_line(city_raw) if city_raw else ""
|
||||
loc_name_display = clean(d.get("name")) or "{}, {}".format(addr, city) if addr else "Location-{}".format(did)
|
||||
loc_id = uid("LOC-")
|
||||
|
||||
|
|
@ -190,6 +341,11 @@ def main():
|
|||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Detect fibre availability from the devices we pre-loaded.
|
||||
# Without devices we leave connection_type empty rather than
|
||||
# guessing — the rep fills it later.
|
||||
conn_type = detect_connection_type(devices_by_delivery.get(did, []))
|
||||
|
||||
try:
|
||||
pgc.execute("""
|
||||
INSERT INTO "tabService Location" (
|
||||
|
|
@ -198,6 +354,7 @@ def main():
|
|||
address_line, city, postal_code, province,
|
||||
latitude, longitude,
|
||||
contact_name, contact_phone,
|
||||
connection_type,
|
||||
legacy_delivery_id
|
||||
) VALUES (
|
||||
%s, %s, %s, %s, %s, 0, 0,
|
||||
|
|
@ -205,16 +362,18 @@ def main():
|
|||
%s, %s, %s, %s,
|
||||
%s, %s,
|
||||
%s, %s,
|
||||
%s,
|
||||
%s
|
||||
)
|
||||
""", (loc_id, now, now, ADMIN, ADMIN,
|
||||
cust_id, loc_name_display[:140],
|
||||
addr or "N/A", city or "N/A",
|
||||
clean(d.get("zip")) or None,
|
||||
postal_norm,
|
||||
clean(d.get("state")) or "QC",
|
||||
lat, lon,
|
||||
clean(d.get("contact")) or None,
|
||||
clean(d.get("tel_home")) or clean(d.get("cell")) or None,
|
||||
conn_type,
|
||||
did))
|
||||
|
||||
del_map[did] = loc_id
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user