# ── Address normalization helpers ─────────────────────────────────────────
# The legacy `gestionclient` data is full of:
#   - postal codes embedded in address_line (e.g. "12 rue X J0S1B0")
#   - abbreviations ("Ch" instead of "Chemin", "Av." for "Avenue")
#   - Cobol-style capitalization ("1Re-Concession" instead of "1re-Concession")
#   - lowercase or no-space postal codes ("j0s1b0" vs "J0S 1B0")
# Without normalization these break the RQA address validator (no exact
# match), Mapbox geocoder fuzzy results, and the human eye on customer
# cards. We catch them at the import step so re-runs converge on the
# canonical form.
import re

# Common French / Quebec street-type abbreviations seen in legacy data,
# as (regex, replacement) pairs applied case-insensitively.
# Every pattern is anchored on word boundaries, so the patterns cannot
# overlap and their order does not matter.
#   - `\bX\b\.?` matches the abbreviation as a whole word and then swallows
#     an optional trailing period, so "Boul." becomes "Boulevard" rather
#     than "Boulevard.".
#   - The Saint/Sainte patterns consume the separator ("." and/or "-") and
#     any following spaces, and require a letter right after, so
#     "St. Louis", "St.-Louis" and "St-Louis" all become "Saint-Louis"
#     while a trailing "St." (no name after it) is left untouched.
ABBREV_MAP = [
    (r'\bCh\b\.?', 'Chemin'),
    (r'\bRte\b\.?', 'Route'),
    (r'\bAv\b\.?', 'Avenue'),
    (r'\bBd\b\.?', 'Boulevard'),
    (r'\bBoul\b\.?', 'Boulevard'),
    (r'\bSt(?:\.-?|-)\s*(?=[^\W\d_])', 'Saint-'),
    (r'\bSte(?:\.-?|-)\s*(?=[^\W\d_])', 'Sainte-'),
    (r'\bMtl\b\.?', 'Montréal'),
]

# Words that should stay lowercase even when title-casing the address.
LOWER_WORDS = {
    'de', 'du', 'des', 'la', 'le', 'les', 'l\'', 'd\'',
    'au', 'aux', 'à',   # Rivière-aux-Outardes, Pointe-au-Chêne
    'et', 'sur',        # Saint-Pierre-et-Miquelon, Bois-sur-Rivière
    'en',               # Cap-en-Haut
}

# RQA standard puts ordinal markers ("1re", "2e", "3e") in lowercase.
ORDINAL_RE = re.compile(r'^(\d+)(re|er|e|ere|eme|ème)$', re.I)

# Canadian postal code with all whitespace already removed: A1A1A1.
_POSTAL_RE = re.compile(r'^[A-Z]\d[A-Z]\d[A-Z]\d$')

# Postal code (with or without the inner space) dangling at the end of a
# free-text field. Requires leading whitespace so it cannot eat the tail
# of a street name.
_TRAILING_POSTAL_RE = re.compile(r'\s+[A-Z]\d[A-Z]\s?\d[A-Z]\d\s*$', re.I)

# Elided article glued to the next word: "l'Église", "d'Or" (straight or
# typographic apostrophe).
_ELISION_RE = re.compile(r"^([dl])(['’])(.+)$", re.I)

# Fibre hints in device category/name/model. The short acronyms must not be
# flanked by letters, otherwise "ont" fires inside "Montréal"/"controller"
# and "onu" inside "bonus"; digits and punctuation are fine ("ont-1234").
# The longer keywords are distinctive enough for a plain substring match.
_FIBRE_HINT_RE = re.compile(
    r'(?<![^\W\d_])(?:ont|onu|cpe)s?(?![^\W\d_])|fibre|gpon|ftth'
)


def normalize_postal_code(pc):
    """Uppercase + strip + insert canonical space: 'j0s1b0' → 'J0S 1B0'.

    Returns None for empty/blank input. A value that does not look like a
    Canadian postal code is returned uppercased with whitespace removed
    (not rejected) so the rep still sees what the legacy record held.
    """
    if not pc:
        return None
    s = re.sub(r'\s+', '', str(pc)).upper()
    if not _POSTAL_RE.match(s):
        return s or None  # malformed; pass through so the rep sees it
    return s[:3] + ' ' + s[3:]


def _case_chunk(chunk, keep_lower):
    """Apply the casing rules to one word (or one piece of a hyphenated word).

    keep_lower is False only for the very first piece of the string, which
    is always capitalized even when it is an article.
    """
    low = chunk.lower()
    if ORDINAL_RE.match(low):
        return low
    if keep_lower and low in LOWER_WORDS:
        return low
    elided = _ELISION_RE.match(chunk)
    if elided:
        # "l'église" → "l'Église": the article follows the article rule,
        # the word behind the apostrophe is capitalized.
        article, apostrophe, rest = elided.groups()
        article = article.lower() if keep_lower else article.upper()
        return article + apostrophe + rest[:1].upper() + rest[1:].lower()
    # NOTE(review): digit-led tokens such as a civic number "12A" come out
    # as "12a" here — confirm whether the RQA validator cares about that.
    return chunk[:1].upper() + chunk[1:].lower()


def clean_address_line(raw, postal_code=None):
    """Normalize a legacy address_line for ingestion into Service Location.

    1. Strip the postal code if it leaked into the end of the field
       (legacy bug that caused 48-char address_line on LPB4's neighbour),
       including a second copy matching `postal_code` and any comma left
       dangling in front of it.
    2. Expand common Quebec street-type abbreviations (ABBREV_MAP).
    3. Title-case words but keep articles ('de', 'la', 'des') lowercase
       and ordinal markers ('1re', '2e') lowercase too. The first word of
       the result is always capitalized; elided articles keep the word
       after the apostrophe capitalized ("l'Église", "Val-d'Or").
    4. Collapse runs of whitespace into single spaces.

    raw: legacy value (HTML entities are unescaped); falsy → "".
    postal_code: optional postal code from the dedicated field, in any
        spacing/case, used to strip a copy glued to the end of `raw`.
    Returns the normalized string (possibly empty).
    """
    if not raw:
        return ""
    s = unescape(str(raw)).strip()

    # 1. Strip embedded postal code (with or without space)
    before = s
    s = _TRAILING_POSTAL_RE.sub('', s).strip()
    # Sometimes the postal code from the dedicated field is doubled, or
    # glued to the street name without a separating space.
    if postal_code:
        pc = re.sub(r'\s+', '', str(postal_code))
        if pc:
            if len(pc) == 6:
                # Accept both "J0S1B0" and "J0S 1B0" in the address text.
                pc_pat = re.escape(pc[:3]) + r'\s?' + re.escape(pc[3:])
            else:
                pc_pat = re.escape(pc)
            s = re.sub(pc_pat + r'\s*$', '', s, flags=re.I).strip()
    if s != before:
        # "12 rue X, J0S 1B0" must not end up as "12 rue X,".
        s = s.rstrip(' ,;')

    # 2. Expand abbreviations
    for pat, repl in ABBREV_MAP:
        s = re.sub(pat, repl, s, flags=re.I)

    # 3 + 4. Word-by-word casing; split() on any whitespace also collapses
    # runs of spaces/tabs, so no separate clean-up pass is needed.
    words = []
    for i, word in enumerate(s.split()):
        words.append('-'.join(
            _case_chunk(chunk, keep_lower=(i > 0 or j > 0))
            for j, chunk in enumerate(word.split('-'))
        ))
    return ' '.join(words)


def detect_connection_type(devices_for_delivery):
    """If the legacy delivery has a fibre device (ONT/CPE/ONU), the SL is on
    fibre. Without devices we leave it empty — the rep will fill in later.

    devices_for_delivery: iterable of mapping-like rows exposing `.get` for
        'category', 'name' and 'model' (presumably dict cursor rows —
        verify against the caller).
    Returns 'Fibre FTTH' as soon as one device matches, otherwise None.
    """
    if not devices_for_delivery:
        return None
    for d in devices_for_delivery:
        combined = ' '.join(
            clean(d.get(field, '')).lower()
            for field in ('category', 'name', 'model')
        )
        if _FIBRE_HINT_RE.search(combined):
            return 'Fibre FTTH'
    return None


def log(msg):
    """Print `msg` prefixed with the current UTC time (HH:MM:SS), flushed."""
    print("[{}] {}".format(datetime.now(timezone.utc).strftime("%H:%M:%S"), msg), flush=True)
+ cur.execute("SELECT delivery_id, category, name, model FROM device WHERE delivery_id IS NOT NULL") + devices_by_delivery = {} + for row in cur.fetchall(): + did_dev = row.get('delivery_id') + if did_dev: + devices_by_delivery.setdefault(did_dev, []).append(row) + log(" {} deliveries have at least one device".format(len(devices_by_delivery))) + # delivery_id → Service Location name mapping (for phases 3-5) del_map = {} loc_ok = loc_skip = loc_err = 0 @@ -174,8 +314,19 @@ def main(): loc_err += 1 continue - addr = clean(d.get("address1")) - city = clean(d.get("city")) + # ── Normalize address fields BEFORE insert ── + # postal_code: "j0s1b0" → "J0S 1B0" (canonical form with space). + # address_line: strip embedded postal codes, expand "Ch."→"Chemin" + # etc., title-case with French article rules. The migrated dataset + # has 96+ "1Re-Concession"-style entries that the RQA validator + # can't match without this pass. + postal_norm = normalize_postal_code(clean(d.get("zip"))) + addr = clean_address_line(d.get("address1"), postal_norm) + city_raw = clean(d.get("city")) + # City: title-case with the same article rules used for road names + # ("Saint-Louis-de-Gonzague" stays correct, "saint-michel" gets + # capitalized to "Saint-Michel"). + city = clean_address_line(city_raw) if city_raw else "" loc_name_display = clean(d.get("name")) or "{}, {}".format(addr, city) if addr else "Location-{}".format(did) loc_id = uid("LOC-") @@ -190,6 +341,11 @@ def main(): except (ValueError, TypeError): pass + # Detect fibre availability from the devices we pre-loaded. + # Without devices we leave connection_type empty rather than + # guessing — the rep fills it later. 
+ conn_type = detect_connection_type(devices_by_delivery.get(did, [])) + try: pgc.execute(""" INSERT INTO "tabService Location" ( @@ -198,6 +354,7 @@ def main(): address_line, city, postal_code, province, latitude, longitude, contact_name, contact_phone, + connection_type, legacy_delivery_id ) VALUES ( %s, %s, %s, %s, %s, 0, 0, @@ -205,16 +362,18 @@ def main(): %s, %s, %s, %s, %s, %s, %s, %s, + %s, %s ) """, (loc_id, now, now, ADMIN, ADMIN, cust_id, loc_name_display[:140], addr or "N/A", city or "N/A", - clean(d.get("zip")) or None, + postal_norm, clean(d.get("state")) or "QC", lat, lon, clean(d.get("contact")) or None, clean(d.get("tel_home")) or clean(d.get("cell")) or None, + conn_type, did)) del_map[did] = loc_id