几乎不存在训练精度和低测试精度答案

【问题标题】：Almost non-existent training accuracy and low test accuracy几乎不存在训练精度和低测试精度
【发布时间】：2021-09-14 10:17:42
【问题描述】：

我对机器学习真的很陌生，而且我一般不太精通编码。但是，需要查看我们商店的客户反馈，平均每年都很多，但我们无法区分正面、负面和中性的百分比。

目前我正在尝试训练一个 Bert 模型来进行简单的多标签情绪分析。输入是我们商店的客户反馈。客户的反馈并不总是那么明确，因为客户确实倾向于长时间地讲述他们的经历，而且他们的情绪并不总是那么清楚。然而，我们设法获得了正面、负面和中性，每组 2247 个样本。

但是当我尝试训练它时，训练准确率大约是 0.4%，这是超低的。验证分数约为 60%。每个标签的 F1 分数约为 60%。我想知道可以做些什么来提高这种训练的准确性。我被困了一段时间。请查看我的代码并帮助我解决此问题。

我尝试过改变学习率（尝试了 Bert 建议的所有学习率和 1e-5），改变 Max LEN，改变 EPOCH 的数量，改变退出率（0.1、0.2、0.3、0.4、0.5），但到目前为止没有任何结果。

#read dataset
df = pd.read_csv("data.csv",header=None, names=['content', 'sentiment'], sep='\;', lineterminator='\r',encoding = "ISO-8859-1",engine="python")

from sklearn.utils import shuffle

df = shuffle(df)

df['sentiment'] = df['sentiment'].replace(to_replace = [-1, 0, 1], value = [0, 1, 2])

df.head()

#Load pretrained FinBert model and get bert tokenizer from it
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

#Choose sequence Length
token_lens = []

for txt in df.content:

 tokens = tokenizer.encode(txt, max_length=512)

 token_lens.append(len(tokens))

sns.distplot(token_lens)

plt.xlim([0, 256]);

plt.xlabel('Token count');

MAX_LEN = 260


#Make a PyTorch dataset
class FIDataset(Dataset):

 def __init__(self, texts, targets, tokenizer, max_len):

   self.texts = texts

   self.targets = targets

   self.tokenizer = tokenizer

   self.max_len = max_len

 def __len__(self):

   return len(self.texts)

 def __getitem__(self, item):

   text = str(self.texts[item])

   target = self.targets[item]

   encoding = self.tokenizer.encode_plus(

     text,

     add_special_tokens=True,

     max_length=self.max_len,

     return_token_type_ids=False,

     pad_to_max_length=True,

     return_attention_mask=True,

     return_tensors='pt',

   )

   return {

     'text': text,

     'input_ids': encoding['input_ids'].flatten(),

     'attention_mask': encoding['attention_mask'].flatten(),

     'targets': torch.tensor(target, dtype=torch.long)

   }

#split test and train
df_train, df_test = train_test_split(

 df,

 test_size=0.1,

 random_state=RANDOM_SEED

)

df_val, df_test = train_test_split(

 df_test,

 test_size=0.5,

 random_state=RANDOM_SEED

)

df_train.shape, df_val.shape, df_test.shape

#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):

 ds = FIDataset(

   texts=df.content.to_numpy(),

   targets=df.sentiment.to_numpy(),

   tokenizer=tokenizer,

   max_len=max_len

 )

 return DataLoader(

   ds,

   batch_size=batch_size,

   num_workers=4

 )

#Load data into train, test, val
BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)

val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)


# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):

 def __init__(self, n_classes):

   super(SentimentClassifier, self).__init__()

   self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

   self.drop = nn.Dropout(p=0.1)

   self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

 def forward(self, input_ids, attention_mask):
   returned = self.bert(
       
       input_ids=input_ids,
       attention_mask=attention_mask
   )
   pooled_output = returned["pooler_output"]
   output = self.drop(pooled_output)
   
   return self.out(output)


#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)

model = model.to(device)


#Optimize with AdamW
EPOCHS = 5

optimizer = AdamW(model.parameters(), lr= 2e-5, correct_bias=False)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(

 optimizer,

 num_warmup_steps=0,

 num_training_steps=total_steps

)

loss_fn = nn.CrossEntropyLoss().to(device)

#Train each Epoch function
def train_epoch(

 model,

 data_loader,

 loss_fn,

 optimizer,

 device,

 scheduler,

 n_examples

):
 
 model = model.train()

 losses = []

 correct_predictions = 0

 for d in data_loader:

   input_ids = d["input_ids"].to(device)

   attention_mask = d["attention_mask"].to(device)

   targets = d["targets"].to(device)

   outputs = model(

     input_ids=input_ids,

     attention_mask=attention_mask

   )

   _, preds = torch.max(outputs, dim=1)

   loss = loss_fn(outputs, targets)

   correct_predictions += torch.sum(preds == targets)

   losses.append(loss.item())

   loss.backward()

   nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

   optimizer.step()

   scheduler.step()

   optimizer.zero_grad()
   
   return correct_predictions.double() / n_examples, np.mean(losses)

#Eval model function
def eval_model(model, data_loader, loss_fn, device, n_examples):

 model = model.eval()

 losses = []

 correct_predictions = 0

 with torch.no_grad():
       
   torch.cuda.empty_cache()

   for d in data_loader:

     input_ids = d["input_ids"].to(device)

     attention_mask = d["attention_mask"].to(device)

     targets = d["targets"].to(device)

     outputs = model(

       input_ids=input_ids,

       attention_mask=attention_mask

     )

     _, preds = torch.max(outputs, dim=1)

     loss = loss_fn(outputs, targets)

     correct_predictions += torch.sum(preds == targets)

     losses.append(loss.item())
       
   
   
 return correct_predictions.double() / n_examples, np.mean(losses)

#training loop through each epochs

import torch

torch.cuda.empty_cache()

history = defaultdict(list)

best_accuracy = 0

if __name__ == '__main__':  

   for epoch in range(EPOCHS):

     print(f'Epoch {epoch + 1}/{EPOCHS}')

     print('-' * 10)

     train_acc, train_loss = train_epoch(

       model,

       train_data_loader,

       loss_fn,

       optimizer,

       device,

       scheduler,

       len(df_train)

     )

     print(f'Train loss {train_loss} accuracy {train_acc}')

     val_acc, val_loss = eval_model(

       model,

       val_data_loader,

       loss_fn,

       device,

       len(df_val)

     )

     print(f'Val   loss {val_loss} accuracy {val_acc}')

     print()

     history['train_acc'].append(train_acc)

     history['train_loss'].append(train_loss)

     history['val_acc'].append(val_acc)

     history['val_loss'].append(val_loss)

     if val_acc > best_accuracy:

       torch.save(model.state_dict(), 'best_model_state.bin')

       best_accuracy = val_acc

-- 编辑：我已经打印出 preds 和 target 以及 train 和 val 准确度

【问题讨论】：

标签： pytorch sentiment-analysis bert-language-model huggingface-transformers multiclass-classification

【解决方案1】：

这里是_, preds = torch.max(outputs, dim=1)，你可能想要argmax，而不是max？

打印出preds 和targets 以更好地了解发生了什么。

在打印出 preds 和 targets 后进行编辑。 对于 epoch 4 和 5，preds 与 targets 完全匹配，因此训练准确度应该为 1。我认为问题在于准确度被划分byn_examples，是整个train数据集中的样本数，除以epoch中的样本数。

【讨论】：

您好，感谢您的评论。我试过 argmax 但它抛出错误 ValueError: too many values to unpack (expected 2)。我编辑了我的第一篇文章，打印出预测和目标。请检查。
@hhp 我已经更新了答案。看起来准确度实际上很好，只是计算方式的一个错误。
嗨，Sergii，既然你说我回去看，确实 n_examples 是 len(df_train)。非常感谢您指出。但是，我确实希望我的测试准确度可以更高一些（负标签的最高准确率为 62%）。我的目标是70%左右。我知道实际上我们的数据集标记本身并不是那么准确（很难通过关于不同主题的 2000 多个 cmets 并正确标记它们。客户在提供 cmets 时往往会混入其中。）但是我该如何改进 F1-分数？
@hhp 根据您报告的初始准确性，很明显某处存在错误。现在这是一个完全不同的问题。也许当前的准确性是任务/模型/数据应该预期的。或者可能需要一些超参数调整。
嗨，Sergii，我尝试将随机种子调整为更高的值，它的总体平均准确率达到了 70%。我已经测试了一些句子，它似乎做得很好。感谢您的帮助。