|
- from operator import itemgetter
- from copy import deepcopy
- import heapq
- import numpy
- import numpy as np
- import torch
- import torch.optim as optim
- from allennlp.common.util import lazy_groups_of
- from allennlp.data.iterators import BucketIterator
- from allennlp.nn.util import move_to_device
- from allennlp.modules.text_field_embedders import TextFieldEmbedder
- from sklearn.metrics import accuracy_score, f1_score
- from allennlp.data.fields import LabelField
- from allennlp.data.instance import Instance
- from allennlp.data.fields import TextField
- from sklearn.metrics import roc_auc_score
- from sklearn.metrics import roc_curve
- import scipy.stats
-
def get_embedding_weight(model):
    """
    Extracts and returns the token embedding weight matrix from the model.

    Every module of ``model`` that is a ``TextFieldEmbedder`` is inspected, and each of
    its token embedders is visited. If more than one token embedder exists, the weight
    of the last one visited is returned.

    Returns:
        The ``weight`` of that token embedder, moved to the CPU and detached from the
        autograd graph.

    Raises:
        ValueError: if the model contains no ``TextFieldEmbedder`` with at least one
            token embedder.
    """
    weight = None
    for module in model.modules():
        if isinstance(module, TextFieldEmbedder):
            for token_embedder in module._token_embedders.values():
                weight = token_embedder.weight

    if weight is None:
        # Previously this fell through to an UnboundLocalError on the return line.
        raise ValueError("model has no TextFieldEmbedder with a token embedder")

    # Convert once, for the weight that is actually returned.
    return weight.cpu().detach()
-
# Module-level buffer that the backward hook registered by add_hooks() appends to.
extracted_grads = []


def extract_grad_hook(module, grad_in, grad_out):
    """Backward hook: store the first entry of ``grad_out`` in ``extracted_grads``."""
    first_output_grad = grad_out[0]
    extracted_grads.append(first_output_grad)
-
def add_hooks(model):
    """
    Registers ``extract_grad_hook`` on every TextFieldEmbedder found in the model.

    For each such embedder, all of its token embedders get ``weight.requires_grad``
    switched on, and the embedder itself gets the backward hook. After a subsequent
    ``loss.backward()`` the module-level ``extracted_grads`` list holds the gradients
    captured by that hook.
    """
    text_field_embedders = (
        candidate for candidate in model.modules()
        if isinstance(candidate, TextFieldEmbedder)
    )
    for text_field_embedder in text_field_embedders:
        for token_embedder in text_field_embedder._token_embedders.values():
            token_embedder.weight.requires_grad = True
        text_field_embedder.register_backward_hook(extract_grad_hook)
-
def embeddings_batch(model, batch, snli=False):
    """
    Runs the first element of ``batch`` through ``model`` and returns its embeddings.

    The batch element is moved to CUDA device 0 first. The model is called with
    ``use_raw=True`` and ``return_embedding=True`` and is expected to return an
    ``(embeddings, mask)`` pair, which is passed back unchanged. With ``snli`` set the
    premise and hypothesis fields are fed in; otherwise the ``tokens`` field is used.
    """
    inputs = move_to_device(batch[0], cuda_device=0)
    if snli:
        text_fields = (inputs['premise'], inputs['hypothesis'])
    else:
        text_fields = (inputs['tokens'],)
    embeddings, mask = model(*text_fields, inputs['label'], use_raw=True, return_embedding=True)
    return embeddings, mask
-
def predict_batch_embedding(model, embeddings, mask, snli=False):
    """
    Calls ``model`` on precomputed ``(embeddings, mask)`` and returns its output.

    The pair is passed as the first positional argument, the label argument is ``None``,
    and the call uses ``use_raw=False`` and ``return_prob=True``. ``snli`` is accepted
    for signature parity with the other helpers but is not used.
    """
    precomputed_inputs = (embeddings, mask)
    return model(precomputed_inputs, None, use_raw=False, return_prob=True)
-
def evaluate_batch(model, batch, trigger_token_ids=None, snli=False, surrogate=False):
    """
    Takes a batch of classification examples (SNLI or SST), and runs them through the model.
    If trigger_token_ids is not None, the trigger tokens are prepended to every example
    (to the hypothesis for SNLI, to the tokens field otherwise) before the forward pass.
    This function is used to get the model's accuracy and/or the loss with/without the trigger.

    Args:
        model: callable taking the text field(s), the labels and the ``adv`` field.
        batch: sequence whose first element is the batch dict; it is moved to CUDA device 0.
        trigger_token_ids: token ids to prepend, or None to evaluate the batch unchanged.
        snli: use the ``premise``/``hypothesis`` fields instead of ``tokens``.
        surrogate: (non-SNLI, with trigger only) replace the labels passed to the model
            with a tensor of ones.

    Returns:
        Whatever ``model`` returns for the (possibly trigger-prefixed) batch.
    """
    batch = move_to_device(batch[0], cuda_device=0)

    if trigger_token_ids is None:
        if snli:
            return model(batch['premise'], batch['hypothesis'], batch['label'], batch['adv'])
        return model(batch['tokens'], batch['label'], batch['adv'])

    # One copy of the trigger per example; for a flat id list this is
    # (number of examples, number of trigger tokens).
    trigger_sequence_tensor = torch.LongTensor(deepcopy(trigger_token_ids))
    trigger_sequence_tensor = trigger_sequence_tensor.repeat(len(batch['label']), 1).cuda()

    if snli:
        original_tokens = batch['hypothesis']['tokens'].clone()
        # Concatenate along dim 1 so the trigger is prepended to each hypothesis, matching
        # the non-SNLI branch. (dim 0 would stack the trigger rows as extra examples.)
        batch['hypothesis']['tokens'] = torch.cat((trigger_sequence_tensor, original_tokens), 1)
        output_dict = model(batch['premise'], batch['hypothesis'], batch['label'], batch['adv'])
        batch['hypothesis']['tokens'] = original_tokens
    else:
        original_tokens = batch['tokens']['tokens'].clone()
        batch['tokens']['tokens'] = torch.cat((trigger_sequence_tensor, original_tokens), 1)
        if not surrogate:
            output_dict = model(batch['tokens'], batch['label'], batch['adv'])
        else:
            # The surrogate is scored against an all-ones label vector.
            surrogate_label = torch.tensor([1] * len(batch['label'])).cuda()
            output_dict = model(batch['tokens'], surrogate_label, batch['adv'])
        batch['tokens']['tokens'] = original_tokens

    return output_dict
-
def evaluate_batch_custom(trainer, main_model, model, batch, trigger_token_ids=None, snli=False):
    """
    Scores a (possibly trigger-prefixed) batch with ``trainer.loss_grads_input``.

    The first batch element is moved to CUDA device 0, the trigger (if given) is
    prepended to its ``tokens`` field, and ``main_model`` produces embeddings via
    ``embeddings_batch``. Those embeddings are flattened to one row per example and
    passed, together with the labels of the caller's ``batch[0]``, to the trainer.

    ``model`` and ``snli`` are accepted but not used here.

    Returns:
        The ``(loss, grads)`` pair returned by ``trainer.loss_grads_input``.
    """
    device_batch = move_to_device(batch[0], cuda_device=0)
    token_field = device_batch['tokens']
    untouched_tokens = token_field['tokens'].clone()

    if trigger_token_ids is not None:
        trigger_rows = torch.LongTensor(deepcopy(trigger_token_ids))
        trigger_rows = trigger_rows.repeat(len(device_batch['label']), 1).cuda()
        token_field['tokens'] = torch.cat((trigger_rows, untouched_tokens), 1)

    embeddings, _mask = embeddings_batch(main_model, [device_batch])
    flat_embeddings = embeddings.view(embeddings.size(0), -1)
    result = trainer.loss_grads_input(flat_embeddings, batch[0]['label'])
    loss, grads = result

    # Put the un-triggered tokens back on the device copy.
    token_field['tokens'] = untouched_tokens

    return loss, grads
-
def get_average_grad_custom(trainer, main_model, model, batch, trigger_token_ids, target_label=None, snli=False,
                            embedding_dim=300):
    """
    Returns the batch-summed input gradient for the trigger positions.

    The gradients come from ``evaluate_batch_custom`` with the trigger prepended to every
    example. If ``target_label`` is set, the labels in ``batch[0]`` are temporarily replaced
    by that label for the evaluation. The original labels are always put back, even if the
    evaluation raises.

    Args:
        trainer, main_model, model, batch, snli: forwarded to ``evaluate_batch_custom``.
        trigger_token_ids: current trigger; its length decides how many rows are returned.
        target_label: optional label to backprop from instead of the batch labels.
        embedding_dim: width of one token embedding, used to un-flatten the gradient.
            Defaults to 300, the value that used to be hard-coded.

    Returns:
        Tensor with ``len(trigger_token_ids)`` rows of ``embedding_dim`` values
        (assumes the flattened gradient is laid out token by token — TODO confirm).
    """
    # prepend triggers to the batch
    original_labels = batch[0]['label'].clone()
    if target_label is not None:
        # set the labels equal to the target (backprop from the target class, not model prediction)
        batch[0]['label'] = int(target_label) * torch.ones_like(batch[0]['label']).cuda()

    try:
        _, grads = evaluate_batch_custom(trainer, main_model, model, batch,
                                         trigger_token_ids=trigger_token_ids, snli=snli)
    finally:
        batch[0]['label'] = original_labels  # reset labels, also on failure

    # sum over the batch, then un-flatten into one row per token position
    averaged_grad = torch.sum(grads, dim=0).reshape(-1, embedding_dim)
    averaged_grad = averaged_grad[0:len(trigger_token_ids)]  # return just trigger grads

    return averaged_grad
-
def get_average_grad(model, batch, trigger_token_ids, target_label=None, snli=False):
    """
    Computes the gradient w.r.t. the trigger tokens when prepended to every example
    in the batch. If target_label is set, that is used as the ground-truth label.

    The gradient is read from the module-level ``extracted_grads`` list, which is filled by
    the backward hook (see ``add_hooks``), and is summed over the batch rather than divided
    by the batch size. The labels of ``batch[0]`` are restored before returning, including
    when the forward/backward pass raises.

    Returns:
        The first ``len(trigger_token_ids)`` rows of the batch-summed gradient.

    Raises:
        IndexError: if no gradient was captured (e.g. no hook was registered).
    """
    global extracted_grads

    # create a dummy optimizer for backprop
    optimizer = optim.Adam(model.parameters())
    optimizer.zero_grad()

    # prepend triggers to the batch
    original_labels = batch[0]['label'].clone()
    if target_label is not None:
        # set the labels equal to the target (backprop from the target class, not model prediction)
        batch[0]['label'] = int(target_label) * torch.ones_like(batch[0]['label']).cuda()

    extracted_grads = []  # clear existing stored grads
    try:
        loss = evaluate_batch(model, batch, trigger_token_ids, snli)['loss']
        loss.backward()
        # index 0 has the hypothesis grads for SNLI. For SST, the list is of size 1.
        grads = extracted_grads[0].cpu()
    finally:
        batch[0]['label'] = original_labels  # reset labels, also on failure

    # sum grad across the batch dimension; result only makes sense for trigger tokens at the front
    averaged_grad = torch.sum(grads, dim=0)
    averaged_grad = averaged_grad[0:len(trigger_token_ids)]  # return just trigger grads
    return averaged_grad
-
-
def get_accuracy_detection(model, dev_dataset, vocab, trigger_token_ids=None, snli=False, get_threshold=False, verbose=False):
    """
    Evaluates how well ``model`` separates clean inputs from trigger-prefixed inputs.

    Every example of ``dev_dataset`` is used twice: once as a "clean" instance with
    ``label`` and ``adv`` set to 0, and once as an "adversarial" instance with both set
    to 1. The clean instances are evaluated as-is; the adversarial ones are evaluated with
    ``trigger_token_ids`` prepended by ``evaluate_batch``. If ``trigger_token_ids`` is None,
    the adversarial copies are evaluated without any trigger.

    Predictions are derived from the model's ``logits`` output:
      * ``model.use_cosine`` true: ``logits >= 0.5``.
      * otherwise, 2-D logits: argmax over axis 1, with every non-zero class mapped to 1.
      * otherwise, 1-D logits: thresholded either at ``model.threshold`` (for models whose
        type name contains "use" and that have a truthy threshold) or at the ROC point
        with the largest geometric mean of TPR and (1 - FPR).

    ``get_threshold`` and ``verbose`` are accepted but currently unused.

    Returns:
        ``(acc, auc, remain_clean, remain_adv)`` where ``remain_clean`` / ``remain_adv`` are
        the positions, within the clean / adversarial half of the outputs, that were
        predicted as 0.
    """
    model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case

    clean_dataset = []
    adv_dataset = []
    for data in dev_dataset:
        for flag, target in ((0, clean_dataset), (1, adv_dataset)):
            fields = {
                'tokens': data['tokens'],
                'label': LabelField(flag, skip_indexing=True),
                'adv': LabelField(flag, skip_indexing=True),
            }
            target.append(Instance(fields))

    if snli:
        iterator = BucketIterator(batch_size=128, sorting_keys=[("premise", "num_tokens")])
    else:
        iterator = BucketIterator(batch_size=128, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # Clean examples first, then the triggered ones, so the outputs split in half below.
    logits = []
    labels = []
    for dataset, triggers in ((clean_dataset, None), (adv_dataset, trigger_token_ids)):
        for batch in lazy_groups_of(iterator(dataset, num_epochs=1, shuffle=False), group_size=1):
            output = evaluate_batch(model, batch, triggers, snli)
            logits.append(output['logits'].detach().cpu().numpy())
            labels.append(output['labels'].detach().cpu().numpy())

    logits = np.concatenate(logits, 0)
    labels = np.concatenate(labels, 0)

    num = len(labels) // 2
    if not model.use_cosine:
        if len(logits.shape) > 1:
            preds_int = np.argmax(logits, 1)
            preds_int[preds_int > 0] = 1  # collapse all non-zero classes into "adversarial"
            scores = preds_int
        else:
            if "use" in str(type(model)).lower() and model.threshold:
                best_threshold = model.threshold
                preds_int = logits <= best_threshold
                scores = preds_int
                print(logits)
            else:
                # pick the threshold maximising the geometric mean of TPR and TNR
                fpr, tpr, thresholds = roc_curve(labels, logits)
                gmeans = np.sqrt(tpr * (1 - fpr))
                idx = np.argmax(gmeans)
                best_threshold = thresholds[idx]
                print("threshold", best_threshold)
                print("Median", np.median(logits))
                print("TPR:", tpr[idx])
                print("FPR:", fpr[idx])
                preds_int = logits >= best_threshold
                scores = logits
    else:
        preds_int = (logits >= 0.5)  # need to find threshold
        scores = logits

    acc = accuracy_score(labels, preds_int)
    auc = roc_auc_score(labels, scores)

    remain_clean = np.where(preds_int[:num] == 0)[0]
    remain_adv = np.where(preds_int[num:] == 0)[0]

    return acc, auc, remain_clean, remain_adv
-
def get_accuracy(model, dev_dataset, vocab, trigger_token_ids=None, snli=False, reset_metric=True):
    """
    When trigger_token_ids is None, gets accuracy on the dev_dataset. Otherwise, gets accuracy with
    triggers prepended for the whole dev_dataset.

    Predictions are the argmax over axis 1 of the model's ``logits`` output and are compared
    against its ``labels`` output.

    Returns:
        ``(acc, auc, f1, f1_weighted, success_idx)``:
          * ``acc``: accuracy score.
          * ``auc``: ROC AUC formatted with four decimals as a string, or ``"N/A"`` when
            scikit-learn cannot compute it for these labels.
          * ``f1`` / ``f1_weighted``: F1 scores, or ``'N/A'`` when only one label value is
            present. ``f1`` falls back to the weighted score when the default (binary)
            F1 is rejected.
          * ``success_idx``: indices where the prediction differs from the label.
    """
    if reset_metric:
        model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case

    if snli:
        iterator = BucketIterator(batch_size=128, sorting_keys=[("premise", "num_tokens")])
    else:
        iterator = BucketIterator(batch_size=128, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    logits = []
    labels = []
    for batch in lazy_groups_of(iterator(dev_dataset, num_epochs=1, shuffle=False), group_size=1):
        output = evaluate_batch(model, batch, trigger_token_ids, snli)
        logits.append(output['logits'].detach().cpu().numpy())
        labels.append(output['labels'].detach().cpu().numpy())

    logits = np.concatenate(logits, 0)
    labels = np.concatenate(labels, 0)
    preds_int = np.argmax(logits, 1)
    success_idx = np.where(labels != preds_int)[0]
    acc = accuracy_score(labels, preds_int)

    if len(np.unique(labels)) > 1:
        f1_weighted = f1_score(labels, preds_int, average="weighted")
        try:
            f1 = f1_score(labels, preds_int)
        except ValueError:
            # default binary averaging is rejected for multiclass targets
            f1 = f1_weighted
    else:
        f1 = 'N/A'
        f1_weighted = 'N/A'

    try:
        auc = "{:.4f}".format(roc_auc_score(labels, preds_int))
    except ValueError:
        # e.g. a single class in labels, or multiclass labels
        auc = "N/A"

    return acc, auc, f1, f1_weighted, success_idx
-
def get_best_candidates(model, batch, trigger_token_ids, cand_trigger_token_ids, snli=False, beam_size=1, surrogate=None):
    """
    Left-to-right beam search over the candidate replacement tokens.

    ``cand_trigger_token_ids`` holds, for each trigger position, the candidate token ids
    for that position. Position 0 is scored first starting from ``trigger_token_ids``;
    every later position is scored for each surviving beam. After each position only the
    ``beam_size`` triggers with the largest loss are kept.

    Returns:
        The trigger with the largest loss among the final beams.
    """
    by_loss = itemgetter(1)

    # Position 0: a single starting point, the current trigger (later positions untouched).
    first_round = get_loss_per_candidate(0, model, batch, trigger_token_ids,
                                         cand_trigger_token_ids, snli, surrogate=surrogate)
    beams = heapq.nlargest(beam_size, first_round, key=by_loss)  # maximize the loss

    # Remaining positions: expand every surviving beam, then prune again.
    for position in range(1, len(trigger_token_ids)):
        scored = []
        for beam_trigger, _ in beams:
            scored += get_loss_per_candidate(position, model, batch, beam_trigger,
                                             cand_trigger_token_ids, snli, surrogate=surrogate)
        beams = heapq.nlargest(beam_size, scored, key=by_loss)

    return max(beams, key=by_loss)[0]
-
def get_best_candidates_custom(trainer, main_model, model, batch, trigger_token_ids, cand_trigger_token_ids, snli=False, beam_size=1):
    """
    Left-to-right beam search that scores candidates with ``get_loss_per_candidate_custom``.

    Position 0 is scored starting from ``trigger_token_ids``; each later position is scored
    for every surviving beam. After each position the ``beam_size`` triggers with the
    largest loss are kept, and the best of the final beams is returned.
    """
    by_loss = itemgetter(1)

    # Position 0 starts from the current trigger only.
    first_round = get_loss_per_candidate_custom(0, trainer, main_model, model, batch, trigger_token_ids,
                                                cand_trigger_token_ids, snli)
    beams = heapq.nlargest(beam_size, first_round, key=by_loss)  # maximize the loss

    # Later positions: try every candidate on top of each surviving beam.
    for position in range(1, len(trigger_token_ids)):
        scored = []
        for beam_trigger, _ in beams:
            scored += get_loss_per_candidate_custom(position, trainer, main_model, model, batch, beam_trigger,
                                                    cand_trigger_token_ids, snli)
        beams = heapq.nlargest(beam_size, scored, key=by_loss)

    return max(beams, key=by_loss)[0]
-
def get_loss_per_candidate_custom(index, trainer, main_model, model, batch, trigger_token_ids, cand_trigger_token_ids, snli=False):
    """
    For position ``index``, tries every candidate token for that position and scores each
    resulting trigger with ``evaluate_batch_custom``.

    Args:
        index: trigger position whose token is being replaced.
        trainer, main_model, model, batch, snli: forwarded to ``evaluate_batch_custom``.
        trigger_token_ids: the current trigger; it is not modified.
        cand_trigger_token_ids: per-position candidate token ids; row ``index`` is used.

    Returns:
        A list of ``(trigger, loss)`` tuples: the unchanged trigger first, then one entry
        per candidate. Losses are NumPy values.

        NOTE(review): when ``cand_trigger_token_ids`` is a flat sequence of integers (one
        candidate per position) the bare ``trigger_token_ids`` is returned instead of a
        list of tuples. That shape is kept for compatibility — confirm callers expect it.
    """
    # any Python or NumPy integer (not only int64) means "flat list, nothing to search"
    if isinstance(cand_trigger_token_ids[0], (numpy.integer, int)):
        print("Only 1 candidate for index detected, not searching")
        return trigger_token_ids

    def score(trigger):
        loss, _ = evaluate_batch_custom(trainer, main_model, model, batch,
                                        trigger_token_ids=trigger, snli=snli)
        return loss.cpu().detach().numpy()

    # loss for the trigger without trying the candidates
    loss_per_candidate = [(deepcopy(trigger_token_ids), score(trigger_token_ids))]

    # iterate the candidates of *this* position rather than sizing the loop from row 0
    for candidate_token in cand_trigger_token_ids[index]:
        candidate_trigger = deepcopy(trigger_token_ids)  # copy trigger
        candidate_trigger[index] = candidate_token  # replace one token
        loss_per_candidate.append((candidate_trigger, score(candidate_trigger)))
    return loss_per_candidate
-
-
def get_loss_per_candidate(index, model, batch, trigger_token_ids, cand_trigger_token_ids, snli=False, surrogate=None):
    """
    For a particular index, the function tries all of the candidate tokens for that index.
    The function returns a list containing the candidate triggers it tried, along with their loss.

    Args:
        index: trigger position whose token is being replaced.
        model: model whose loss is being maximised; its metrics are reset first.
        batch, snli: forwarded to ``evaluate_batch``.
        trigger_token_ids: the current trigger; it is not modified.
        cand_trigger_token_ids: per-position candidate token ids; row ``index`` is used.
        surrogate: optional second model. When given, its loss is added to the score,
            unless it predicts a non-zero class for more than half of the batch, in which
            case -9999 is added instead.

    Returns:
        A list of ``(trigger, loss)`` tuples: the unchanged trigger first, then one entry
        per candidate.

        NOTE(review): when ``cand_trigger_token_ids`` is a flat sequence of integers (one
        candidate per position) the bare ``trigger_token_ids`` is returned instead of a
        list of tuples. That shape is kept for compatibility — confirm callers expect it.
    """
    # any Python or NumPy integer (not only int64) means "flat list, nothing to search"
    if isinstance(cand_trigger_token_ids[0], (numpy.integer, int)):
        print("Only 1 candidate for index detected, not searching")
        return trigger_token_ids
    model.get_metrics(reset=True)

    def cal_loss(trigger):
        curr_loss = evaluate_batch(model, batch, trigger, snli)['loss'].cpu().detach().numpy()
        detect_loss = 0  # we want the best candidate to have the max loss value
        if surrogate:
            # one surrogate forward pass supplies both the loss and the logits
            surrogate_output = evaluate_batch(surrogate, batch, trigger, snli, surrogate=True)
            detect_loss = surrogate_output['loss'].cpu().detach().numpy()
            detector_pred = np.argmax(surrogate_output['logits'].cpu().detach().numpy(), 1)
            if (detector_pred != 0).mean() > 0.5:
                # mostly non-zero predictions: push this candidate to the bottom of the ranking
                detect_loss = -9999
        return curr_loss + detect_loss  # we want the best candidate to have the max detect_loss

    # loss for the trigger without trying the candidates
    loss_per_candidate = [(deepcopy(trigger_token_ids), cal_loss(trigger_token_ids))]

    # iterate the candidates of *this* position rather than sizing the loop from row 0
    for candidate_token in cand_trigger_token_ids[index]:
        candidate_trigger = deepcopy(trigger_token_ids)  # copy trigger
        candidate_trigger[index] = candidate_token  # replace one token
        loss_per_candidate.append((candidate_trigger, cal_loss(candidate_trigger)))
    return loss_per_candidate
|