【发布时间】:2022-08-14 17:18:21
【问题描述】:
我正在使用 AG News 数据集训练一个文本分类模型。
下面是使用 TabularDataset 从 CSV 文件生成数据集的部分:
import torchtext
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
import spacy
def des_tokenize(x):
    """Tokenize a description string by splitting on single space characters.

    Splitting on a literal ``' '`` (rather than on any whitespace) means that
    consecutive, leading or trailing spaces produce empty-string tokens.
    """
    return x.split(' ')
def title_tokenize(x):
    """Tokenize a title string by splitting on single space characters.

    Splitting on a literal ``' '`` (rather than on any whitespace) means that
    consecutive, leading or trailing spaces produce empty-string tokens.
    """
    return x.split(' ')
def category_tokenize(x):
    """Return the category value unchanged (identity tokenizer)."""
    return x
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One Field per text column; title and description get start/end markers.
CATEGORY = Field(tokenize=category_tokenize)
TITLE = Field(tokenize=title_tokenize, init_token='<SOS>', eos_token='<EOS>')
DES = Field(tokenize=des_tokenize, init_token='<SOS>', eos_token='<EOS>')

spacy_en = spacy.load('en_core_web_sm')

# A field of None drops that column ('id') from the resulting examples.
train_fields = [('id', None), ('category', CATEGORY), ('title', TITLE), ('description', DES)]
# NOTE(review): only two columns are listed here versus four for training;
# confirm that test.csv really has title/description as its first two columns.
test_fields = [('title', TITLE), ('description', DES)]

train_data = TabularDataset(
    path='/content/drive/MyDrive/summer2/train.csv',
    format='csv',
    fields=train_fields,
    skip_header=True,
)
test_data = TabularDataset(
    path='/content/drive/MyDrive/summer2/test.csv',
    format='csv',
    fields=test_fields,
    skip_header=True,
)
生成数据集后,我选择使用预训练的词嵌入模型 torchtext.vocab.GloVe 来构建词表(vocab)。
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
train_batch_size = 10
test_batch_size = 1
max_length = 256
tokenizer = get_tokenizer('basic_english')

train_iter = torchtext.legacy.data.BucketIterator(
    train_data,
    batch_size=train_batch_size,
)
test_iter = torchtext.legacy.data.BucketIterator(
    test_data,
    batch_size=test_batch_size,
)

# Load the pretrained GloVe vectors once and share them between both text
# fields instead of constructing the same vectors object twice.
glove_vectors = torchtext.vocab.GloVe(name="6B", dim=50, max_vectors=50_000)

DES.build_vocab(
    train_data,
    vectors=glove_vectors,
    max_size=50_000,
)
TITLE.build_vocab(
    train_data,
    vectors=glove_vectors,
    max_size=50_000,
)
CATEGORY.build_vocab(train_data)
调用 create_batches 函数后的输出看起来没有问题:
def create_batches(self):
    """Build the batches from ``self.data()`` and store them on ``self.batches``.

    The examples returned by ``self.data()`` are grouped by ``batch`` using
    ``self.batch_size`` and ``self.batch_size_fn``.
    """
    self.batches = batch(self.data(), self.batch_size, self.batch_size_fn)
# Create batches - needs to be called before each loop.
train_iter.create_batches()

# Loop through BucketIterator.
print('PyTorchText BuketIterator\n')
for batch in train_iter.batches:

    # Let's check batch size.
    print('Batch size: %d\n' % len(batch))
    print('category\ttitle\tdescription'.ljust(10))

    # Print each example.
    for example in batch:
        print('%s \t %s \t %s'.ljust(10) % (example.category, example.title, example.description))
    print('\n')

    # Only look at first batch. Reuse this code in training models.
    break
输出如下:
PyTorchText BuketIterator
Batch size: 10
category title description
2 [\'UPDATE\', \'1-Open-Rejuvenated\', \'Haas\', \'reaches\', \'last\', \'eight\'] [\'Germany\', \'#39;s\', \'Tommy\', \'Haas\', \'continued\', \'his\', \'resurgence\', \'with\', \'a\', \'7-6\', \'6-1\', \'7-5\', \'victory\', \'over\', \'Czech\', \'teenager\', \'Tomas\', \'Berdych\', \'on\', \'Tuesday\', \'to\', \'reach\', \'the\', \'quarter-finals\', \'of\', \'the\', \'US\', \'Open\', \'for\', \'the\', \'first\', \'time.\']
3 [\'Japan\', \'#39;s\', \'Nikkei\', \'Average,\', \'Topix\', \'Advance;\', \'Toyota,\', \'Advantest\', \'Gain\'] [\'Japan\', \'#39;s\', \'Nikkei\', \'225\', \'Stock\', \'Average\', \'rose\', \'56.74,\', \'or\', \'0.5\', \'percent,\', \'to\', \'11,139.97\', \'at\', \'9:01\', \'am\', \'in\', \'Tokyo.\', \'The\', \'broader\', \'Topix\', \'index\', \'gained\', \'5.35,\', \'or\', \'0.5\', \'percent,\', \'to\', \'1132.\']
2 [\'Wildcats\', \'on\', \'the\', \'rise\', \'with\', \'Santos\'] [\'The\', \'University\', \'of\', \'New\', \"Hampshire\'s\", \'impressive\', \'51-40\', \'road\', \'victory\', \'over\', \'10th-ranked\', \'Villanova\', \'Saturday\', \'night\', \'vaulted\', \'the\', \'Wildcats\', \'three\', \'spots\', \'to\', \'ninth\', \'in\', \'this\', \"week\'s\", \'Sports\', \'Network\', \'1-AA\', \'football\', \'poll,\', \'while\', \'dropping\', \'Villanova\', \'to\', \'14th.\']
1 [\'Cracking\', \'under\', \'the\', \'strain\'] [\'Severe\', \'cracks\', \'surfaced\', \'inside\', \'the\', \'Israeli\', \'government\', \'this\', \'week\', \'as\', \'its\', \'senior\', \'law\', \'officers\', \'publicly\', \'fell\', \'out\', \'with\', \'the\', \'defence\', \'establishment\', \'and\', \'the\', \'Foreign\', \'Ministry\', \'over\', \'the\', \'country\', \'#39;s\', \'future\', \'strategy\', \'in\', \'the\', \'face\', \'of\', \'the\', \'July\', \'verdict\', \'of\', \'the\', \'International\', \'\']
1 [\'Arab\', \'League\', \'to\', \'hold\', \'emergency\', \'meeting\'] [\'The\', \'Arab\', \'League\', \'says\', \'it\', \'will\', \'hold\', \'an\', \'emergency\', \'session\', \'to\', \'discuss\', \'the\', \'violence\', \'in\', \'Gaza,\', \'which\', \'has\', \'claimed\', \'at\', \'least\', \'56\', \'Palestinians\', \'this\', \'week.\']
2 [\'Holmes\', \'to\', \'decide\', \'on\', \'double\'] [\'Kelly\', \'Holmes\', \'has\', \'still\', \'to\', \'confirm\', \'whether\', \'she\', \'will\', \'attempt\', \'to\', \'repeat\', \'her\', \'Olympic\', \'double\', \'at\', \'this\', \'weekend\', \'#39;s\', \'World\', \'Athletics\', \'Final\', \'after\', \'clearing\', \'the\', \'first\', \'hurdle\', \'with\', \'a\', \'victory\', \'in\', \'the\', \'1500m\', \'yesterday.\']
2 [\'NBA\', \'suspends\', \'nine\', \'players,\', \'Artest\', \'for\', \'rest\', \'of\', \'season\'] [\'NBA\', \'on\', \'Sunday\', \'suspended\', \'nine\', \'players\', \'for\', \'involving\', \'in\', \'a\', \'melee\', \'during\', \'Friday\', \'#39;s\', \'game\', \'between\', \'Detorit\', \'Pistons\', \'and\', \'Indiana\', \'Pacers,\', \'with\', \'Ron\', \'Artest\', \'suspended\', \'for\', \'the\', \'rest\', \'of\', \'the\', \'season,\', \'73\', \'games.\']
2 [\'On\', \'the\', \'Far\', \'Side\', \'of\', \'the\', \'Field,\', \'a\', \'Familiar\', \'Face\'] [\'Perhaps\', \'there\', \'will\', \'be\', \'a\', \'moment\', \'during\', \"Sunday\'s\", \'game\', \'between\', \'the\', \'Giants\', \'and\', \'the\', \'Redskins\', \'when\', \'a\', \'coach\', \'and\', \'his\', \'former\', \'franchise\', \'quarterback\', \'will\', \'do\', \'a\', \'double\', \'take.\']
3 [\'\', \'#39;QUIET\', \'#39;\', \'RULE\', \'MAY\', \'CHANGE\'] [\'The\', \'Securities\', \'and\', \'Exchange\', \'Commission\', \'wants\', \'to\', \'scrap\', \'a\', \'1933\', \'rule\', \'that\', \'forces\', \'a\', \'strict\', \'\', \'quot;quiet\', \'period\', \'quot;\', \'on\', \'all\', \'talk\', \'about\', \'a\', \'company\', \'just\', \'prior\', \'to\', \'its\', \'stock\', \'being\', \'sold\', \'initially\', \'to\', \'the\', \'public.\']
2 [\'Denehy\', \'boosts\', \'Walpole\', \'\'] [\'Danvers\', \'coach\', \'thought\', \'he\', \'had\', \'the\', \'perfect\', \'game\', \'plan\', \'against\', \'Walpole\', \'last\', \'night\', \'in\', \'the\', \'Division\', \'2\', \'playoffs\', \'at\', \'Endicott\', \'College.\', \'It\', \'was\', \'the\', \'same\', \'game\', \'plan\', \'that\', \'earned\', \'his\', \'team\', \'its\', \'first\', \'playoff\', \'berth\', \'in\', \'63\', \'years.\']
我的问题是:如果我改用 build_vocab_from_iterator 来创建迭代器,会怎样?
它的作用与我上面使用 BucketIterator 的那部分代码是否相同?
另外,我认为在这个任务中使用预训练词嵌入 GloVe 比 FastText 更合适,因为模型需要判断每条描述属于哪个类别。
标签: python machine-learning nlp pytorch vocabulary