根据 turpachull 的回答和 Python 3 的标准编码列表(以及 Mark Amery 列出各个 Python 版本所支持编码集合的回答),下面这个脚本会对标准输入的每一行依次尝试每一种编码转换,并输出所有与原始 utf_8 文本不同的结果。
#!/usr/bin/env python3
import sys
import fileinput
# Codec names that the main loop tries, in this order, against every input
# line.  The "utf_8" entry is always printed by the loop, even when it leaves
# the text unchanged.
# NOTE(review): the names appear to come from Python's "standard encodings"
# table; availability varies by Python version -- confirm for the target
# interpreter.
encodings = ["ascii", "big5hkscs", "cp1006", "cp1125", "cp1250", "cp1252", "cp1254", "cp1256", "cp1258", "cp273", "cp437", "cp720", "cp775", "cp852", "cp856", "cp858", "cp861", "cp863", "cp865", "cp869", "cp875", "cp949", "euc_jis_2004", "euc_kr", "gbk", "hz", "iso2022_jp_1", "iso2022_jp_2004", "iso2022_jp_ext", "iso8859_11", "iso8859_14", "iso8859_16", "iso8859_3", "iso8859_5", "iso8859_7", "iso8859_9", "koi8_r", "koi8_u", "latin_1", "mac_cyrillic", "mac_iceland", "mac_roman", "ptcp154", "shift_jis_2004", "utf_16_be", "utf_32", "utf_32_le", "utf_7", "utf_8_sig", "big5", "cp037", "cp1026", "cp1140", "cp1251", "cp1253", "cp1255", "cp1257", "cp424", "cp500", "cp737", "cp850", "cp855", "cp857", "cp860", "cp862", "cp864", "cp866", "cp874", "cp932", "cp950", "euc_jisx0213", "euc_jp", "gb18030", "gb2312", "iso2022_jp", "iso2022_jp_2", "iso2022_jp_3", "iso2022_kr", "iso8859_10", "iso8859_13", "iso8859_15", "iso8859_2", "iso8859_4", "iso8859_6", "iso8859_8", "johab", "koi8_t", "kz1048", "mac_greek", "mac_latin2", "mac_turkish", "shift_jis", "shift_jisx0213", "utf_16", "utf_16_le", "utf_32_be", "utf_8"]
def maybe_fix_encoding(utf8_string: str, possible_codec: str = "utf_8") -> str:
    """Try to undo mojibake by round-tripping *utf8_string* through a codec.

    The text is encoded to bytes with *possible_codec* and those bytes are
    then decoded as UTF-8.  This reverses the common corruption where UTF-8
    bytes were mistakenly decoded with *possible_codec*.

    Args:
        utf8_string: The (possibly mis-decoded) text to repair.
        possible_codec: Name of the codec suspected of having been used for
            the wrong decode.  With the default ``"utf_8"`` the round trip
            returns the text unchanged.

    Returns:
        The re-decoded text when both steps succeed; otherwise the original
        *utf8_string*, untouched.

    Raises:
        LookupError: If *possible_codec* is not a codec name known to this
            interpreter.  Only ``UnicodeError`` is handled here.
    """
    try:
        return utf8_string.encode(possible_codec).decode('utf_8')
    except UnicodeError:
        # Either the text cannot be represented in possible_codec, or the
        # resulting bytes are not valid UTF-8: this codec is not the culprit.
        return utf8_string
# For every line on stdin, try each candidate codec and print one
# "<codec>\t<result>" row per codec whose round trip changed the text.
for line in sys.stdin:
    # Strip the trailing newline once per line; it does not depend on the codec.
    text = line.rstrip('\n')
    for codec in encodings:
        candidate = maybe_fix_encoding(text, codec)
        # The utf_8 row is always shown, even though it leaves the text as is.
        if candidate != text or codec == 'utf_8':
            print(codec, candidate, sep="\t")
    # print() appends its own newline, so this emits two blank lines after
    # each input line's rows.
    # NOTE(review): the source lost its indentation; this separator is placed
    # once per input line (not per codec) -- confirm against the original.
    print("\n")
用法例如:
$ echo 'Requiem der morgenrÃ¶te' | ~/decode_string.py
cp1252 Requiem der morgenröte
cp1254 Requiem der morgenröte
iso2022_jp_1 Requiem der morgenr(D**B"yte
iso2022_jp_2 Requiem der morgenr(D**B"yte
iso2022_jp_2004 Requiem der morgenr(Q):B"yte
iso2022_jp_3 Requiem der morgenr(O):B"yte
iso2022_jp_ext Requiem der morgenr(D**B"yte
latin_1 Requiem der morgenröte
iso8859_9 Requiem der morgenröte
iso8859_14 Requiem der morgenröte
iso8859_15 Requiem der morgenröte
mac_iceland Requiem der morgenr̦te
mac_roman Requiem der morgenr̦te
mac_turkish Requiem der morgenr̦te
utf_7 Requiem der morgenr+AMMAtg-te
utf_8 Requiem der morgenrÃ¶te
utf_8_sig Requiem der morgenrÃ¶te