From 1b6e13e77ef4abffe66f2145acb3537bd3deb6bf Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Wed, 24 Jun 2026 12:25:14 +0200 Subject: [PATCH] fix: detect upper-accented+currency mojibake at string start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an upper-accented letter (such as Ã) is followed by a currency symbol (such as ¥, YEN SIGN) at the very beginning of a string, the badness heuristic failed to detect it as mojibake. An existing pattern (`\s [{upper_accented}] [{currency}]`) already caught this case when a preceding whitespace was present, but the start-of-string case was missing. Add a BADNESS_RE pattern `^[{upper_accented}][{currency}]\w` that matches this sequence at position 0 only when followed by a word character. The trailing `\w` ensures the pattern does not match the isolated 2-character substring that `decode_inconsistent_utf8` passes to `is_bad()` during processing of other text, preventing false positives on ambiguous embedded sequences like "DrÃ¥ber". Fixes #222. --- ftfy/badness.py | 6 ++++++ tests/test-cases/synthetic.json | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/ftfy/badness.py b/ftfy/badness.py index 38ec1f4..2272c54 100644 --- a/ftfy/badness.py +++ b/ftfy/badness.py @@ -364,6 +364,12 @@ | ^[ÃÂ][ ] | + # Upper-accented letter followed by a currency symbol at the very + # start of the string (otherwise usually requires a preceding space). + # Require a word character after the pair so the pattern does not match + # the isolated 2-character substring inside decode_inconsistent_utf8. 
+ ^[{upper_accented}][{currency}]\w + | # Cases where Â precedes a character as an encoding of exactly the same # character, and the character is common enough diff --git a/tests/test-cases/synthetic.json b/tests/test-cases/synthetic.json index a939311..14d4e33 100644 --- a/tests/test-cases/synthetic.json +++ b/tests/test-cases/synthetic.json @@ -204,5 +204,12 @@ "original": "OÙ ET QUAND?", "fixed": "OÙ ET QUAND?", "expect": "pass" + }, + { + "label": "Synthetic: mojibake at the beginning of a string (Ã¥ for å)", + "comment": "issue #222: å mojibake not detected at the very start of a string. The badness heuristic missed Ã¥ when a currency symbol followed an upper-accented letter without a preceding space.", + "original": "Ã¥klagarmyndighets", + "fixed": "åklagarmyndighets", + "expect": "pass" } ] \ No newline at end of file