diff --git a/ftfy/badness.py b/ftfy/badness.py index 38ec1f4..2272c54 100644 --- a/ftfy/badness.py +++ b/ftfy/badness.py @@ -364,6 +364,12 @@ | ^[ÃÂ][ ] | + # Upper-accented letter followed by a currency symbol at the very + # start of the string (otherwise usually requires a preceding space). + # Require a word character after the pair so the pattern does not match + # the isolated 2-character substring inside decode_inconsistent_utf8. + ^[{upper_accented}][{currency}]\w + | # Cases where Â precedes a character as an encoding of exactly the same # character, and the character is common enough diff --git a/tests/test-cases/synthetic.json b/tests/test-cases/synthetic.json index a939311..14d4e33 100644 --- a/tests/test-cases/synthetic.json +++ b/tests/test-cases/synthetic.json @@ -204,5 +204,12 @@ "original": "OÙ ET QUAND?", "fixed": "OÙ ET QUAND?", "expect": "pass" + }, + { + "label": "Synthetic: mojibake at the beginning of a string (Ã¥ for å)", + "comment": "issue #222: å mojibake not detected at the very start of a string. The badness heuristic missed Ã¥ when a currency symbol followed an upper-accented letter without a preceding space.", + "original": "Ã¥klagarmyndighets", + "fixed": "åklagarmyndighets", + "expect": "pass" } ] \ No newline at end of file