经过一些测试之后,我可以说 str.translate() 是最好的方案。
输入数据:
# Punctuation characters to strip from a document.
symbols = {
    "`", "~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")",
    "_", "-", "+", "=", "{", "[", "]", "}", "|", "\\", ":", ";",
    "\"", "<", ",", ">", ".", "?", "/",
}
# Deletion table for ``str.translate``: a code point mapped to ``None`` is removed.
translate_table = {ord(symbol): None for symbol in symbols}
# Character class matching every symbol to strip. Written as a raw string so
# the regex escapes reach ``re`` untouched: in a non-raw literal "\\:" collapses
# to the regex escape "\:" (a plain colon), which silently drops the backslash
# from the class, and "\-" / "\]" are invalid string escapes.
regular_expression = r"[`~!@#$%^&*()_\-+={[\]}|\\:;\"<,>.?/]"
# Short sample containing a handful of symbols.
small_document = "Some**r@an]]\"dom t##xt"
# Medium-length sample with symbols scattered between letters.
normal_document = "TbsX^Kt$FZ%haZe+sLxu:Al\"xNAL\\Kix[mHp_gn]PrG`DqGd~GdNc;BoEq.SYD?Rp>ukq,UfO<XdTc=RUH}oifc&oP!CB*me@Qv{Qf-Li)gmXL/IQH#mne(Khaj|"
# Long sample with symbols scattered between letters.
big_document = "QOfY+dymyoGBAxTAoIeM+jEWlaECUZEUXuMvprJOqFtQR*OiHtTFZkUNbYipSTTDPOVkIdGTcjWrQmbmthKBHBSEOZ)lQAIJOrVgmGGFdtqbuFfj<Dls<JWtKczAFMPYMemiJBJHdPeeul\\x>lGIBvUsxBokagvVovrrdxdKMtAKx>MEexYv>DGqPUXYaBQKwiSIUobrPQYjilhHMQunE;RiqOZPTnyOEgRrpxcuobvvmGkFpTqgMxYYhrmRRnauiqgvCmZ\"UauceaXsgAMSakxewzPrlIrYkVCVZaEGh]qiizYyzbkcHPF@qQsQMfHPDEbEnWtrCFoARUYAloOcctqmL@hegZbfhsHaJOxOxzQhZAVjVDgokosATfhKMT!WYyPWKcKAHKCzQGGJOCglYGZbftsuyntXZUKNqgGlsLJqgN,pUcOoA/tStXFXgpoSErgvw/OUMPWjJwt=bhMAIDayOZXJm=ifYYUuAvSIZjwnBfktNvEvZmvQso%HiNZEVqoDR%nQBtCkhjSfVfDuRSRsvp-sCunjDDUYSEVLICQdisxhEfqkUTkiPlLiUNNwrvO#WTDmweZyMeIbgNXkIsvaJeHYXV(HvRcGNZM(PPRIAyyLWivGiqMVBtwObqLfEEISyyjGNEdUU:ys`dXcVawkIEAjFXky`RUXNTm`LDM}mwTOcmsSo}haJXPnkwOhKLYwve}SWifzKq}grw}fMSQXXWguUQtlWpPZQymR^wBKEyolFlZnzEEmehSNenOqDOHWRit[Npm?R?DIPXAmQYYBbmJofxUzzWBsVCoPI?VmpXhoMxCfXyHEHowXzIJvExThiffLhBTtma_jk_NrbkPCGGypXvOuBqBxDYfC{bwIHoaqnJSKytxwWXBNnKG~PKuQklGblEwH~rJoGpKZmm~tTEFnPLdmzfrqJibMYIykzL$RZLPmsZjB$AAbZwFnByOydEOIfFvTaEQaSjbpeBZuUGY&ZfPQgLihmPYrhZxSwMzLrNF.WjFiDCLyXksdkLeMHVCfrdgCAotElQ|"
# Letters-only sample: contains none of the symbols, so nothing is removed.
no_match_document = "XOtasggWqhtSLJpHEGoCmMRepFBlRfAGKTLPcEtKonFVsPgvWgAbvJVeMWILPgLapwAmTgXWVbxOJtUFmMygzIqYPqyAxzwElTFyYcGdtnNa"
代码:
def func1(doc):
    """Return *doc* with every character in ``symbols`` removed.

    Uses chained ``str.replace`` calls: one full pass over the document
    is made for each symbol.
    """
    for c in symbols:
        doc = doc.replace(c, "")
    return doc
def func2(doc):
    """Return *doc* with the characters listed in ``translate_table`` removed.

    Uses ``str.translate``; code points mapped to ``None`` are deleted.
    """
    return doc.translate(translate_table)
import re  # needed by func3; no import of ``re`` is visible in this snippet


def func3(doc):
    """Return *doc* with every match of ``regular_expression`` removed.

    Uses ``re.sub`` with an empty replacement string.
    """
    return re.sub(regular_expression, "", doc)
def func4(doc):
    """Return *doc* with every character in ``symbols`` removed.

    Filters the document character by character and joins the survivors.
    """
    return "".join(c for c in doc if c not in symbols)
测试结果:
func1(small_document): 0.701037002
func1(normal_document): 1.1260866900000002
func1(big_document): 3.4234831459999997
func1(no_match_document): 0.7740780450000004
func2(small_document): 0.14135037500000003
func2(normal_document): 0.5368806810000004
func2(big_document): 0.8128472860000002
func2(no_match_document): 0.394245089
func3(small_document): 0.3157141610000007
func3(normal_document): 0.927359323000001
func3(big_document): 1.9310377590000005
func3(no_match_document): 0.18656399199999996
func4(small_document): 0.3034549070000008
func4(normal_document): 1.3695875739999988
func4(big_document): 10.115730064
func4(no_match_document): 1.2086623230000022
更新:
我提供的输入数据是专门为测试这些方法本身而“准备”的。
为了生成 translate_table,我使用了下面的字典推导式:
# Map each symbol's code point to None so that str.translate deletes it.
translate_table = {ord(s): None for s in symbols}
这里是一个用于验证正则表达式的网站链接(可能会有所帮助)。
如果你想自己重新运行测试,代码如下:
if __name__ == '__main__':
    import timeit

    # Time every function against every sample document. Each call is
    # executed 100000 times and the total elapsed time is printed, with
    # the same "<call>: " label on every line.
    for func_name in ("func1", "func2", "func3", "func4"):
        for doc_name in ("small_document", "normal_document",
                         "big_document", "no_match_document"):
            call = "{}({})".format(func_name, doc_name)
            setup = "from __main__ import {}, {}".format(func_name, doc_name)
            print(call + ": ", timeit.timeit(call, setup=setup, number=100000))