Assignments/Assignment4/tagger.py

# -*- coding: utf-8 -*-
"""NLP-ass4.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/15TnxrSGp88y0ijmcCga1xPdxBJJ0oF4U

##Notebook Setup
"""

#from google.colab import drive
#drive.mount('/content/drive')
#drive_path = '/content/drive/MyDrive/NLP/Ex4/'
#train_file_path = "en-ud-train.upos.tsv"
#test_file_path = "en-ud-dev.upos.tsv"
#embeddings_file_path = "glove.6B.100d.txt"

"""## Environment Setup"""

"""
nlp, assignment 4, 2021

In this assignment you will implement a Hidden Markov model and an LSTM model
to predict the part of speech sequence for a given sentence.
(Adapted from Nathan Schneider)

"""

import torch
import torch.nn as nn
from torchtext import data, vocab as torch_vocab

from torchtext.legacy import data as data_l
import torch.optim as optim
from math import log, isfinite
from collections import Counter

import sys, os, time, platform, nltk, random
import numpy as np


# Compute device shared by the whole module: CUDA when
# torch.cuda.is_available() reports a usable GPU, otherwise the CPU.
# Models and tensors below are moved to this device with .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# You can call use_seed with other seeds or None (for complete randomization)
# but DO NOT change the default value.
def use_seed(seed = 2512021):
    """Seed the ``random``, ``numpy`` and ``torch`` random number generators.

    Args:
        seed (int or None): the seed to use. ``None`` re-seeds every
            generator from fresh entropy (no reproducibility).
    """
    random.seed(seed)
    np.random.seed(seed)
    if seed is None:
        # torch.manual_seed() does not accept None; torch.seed() picks a
        # non-deterministic seed instead.
        torch.seed()
    else:
        torch.manual_seed(seed)
    # torch.set_deterministic() was deprecated in favour of
    # torch.use_deterministic_algorithms(); use whichever this torch has.
    enable_determinism = getattr(torch, "use_deterministic_algorithms", None)
    if enable_determinism is None:
        enable_determinism = torch.set_deterministic
    enable_determinism(True)
    #torch.backends.cudnn.deterministic = True

# utility functions to read the corpus
def who_am_i(): #this is not a class method
    """Return the author's details.

    Return:
        dict: a dictionary with the keys 'name', 'id' and 'email'.
    """
    keys = ('name', 'id', 'email')
    values = ('Yiftach Savransky', '312141369', 'yiftachs@post.bgu.ac.il')
    return dict(zip(keys, values))

def read_annotated_sentence(f):
    """Read the next tagged sentence from an open corpus file.

    Each non-blank line holds a token and its tag separated by a tab;
    sentences are separated by blank lines. Any columns after the tag are
    ignored.

    Args:
        f: an open text file positioned at the start of a line.

    Return:
        list: the sentence as a list of (word, tag) pairs, or None when the
        end of the file is reached before any token is found.
    """
    line = f.readline()
    # Skip separator lines so that leading or repeated blank lines do not
    # produce an empty sentence.
    while line and not line.strip():
        line = f.readline()
    if not line:
        return None
    sentence = []
    while line and line.strip():
        # Only the first two columns are used; unpacking still raises
        # ValueError when a line has fewer than two columns.
        word, tag = line.strip().split("\t")[:2]
        sentence.append((word, tag))
        line = f.readline()
    return sentence

def load_annotated_corpus(filename):
    """Load a whole tagged corpus from a UTF-8 encoded file.

    Args:
        filename (str): path of the corpus file.

    Return:
        list: the non-empty sentences of the file, each one a list of
        (word, tag) pairs as produced by read_annotated_sentence().
    """
    sentences = []
    with open(filename, 'r', encoding='utf-8') as f:
        # read_annotated_sentence() signals the end of the file with None.
        # An empty sentence (stray blank line) is skipped instead of being
        # treated as the end of the corpus.
        for sentence in iter(lambda: read_annotated_sentence(f), None):
            if sentence:
                sentences.append(sentence)
    return sentences

"""## Learn params and Baseline Tagset"""

# Pseudo tags that mark the sentence boundaries in the transition model.
START = "<DUMMY_START_TAG>"
END = "<DUMMY_END_TAG>"
# Placeholder for unknown words (not referenced by the code visible in this file).
UNK = "<UNKNOWN>"

# Module-level defaults for the learned parameters; learn_params() builds and
# returns its own instances with the same names.
allTagCounts = Counter()
# use Counters inside these
perWordTagCounts = {}
transitionCounts = {}
emissionCounts = {}
# log probability distributions: do NOT use Counters inside these because
# missing Counter entries default to 0, not log(0)
A = {} # transition log-probabilities
B = {} # emission log-probabilities

def get_all_tag_counts(annotated_corpus):
  """Count how many times each tag occurs in the corpus.

  The counts hold raw corpus frequencies only: no pseudocounts, no dummy
  tags and no unknowns.

  Args:
      annotated_corpus (list): a list of sentences, each a list of (w,t) pairs.

  Return:
      Counter: tag -> number of occurrences.
  """
  tag_counts = Counter()
  for annotated_sentence in annotated_corpus:
    tag_counts.update(annotated_token[1] for annotated_token in annotated_sentence)
  return tag_counts

def get_pairs_counts(annotated_corpus):
  """Count how many times each (word, tag) pair occurs in the corpus.

  Args:
      annotated_corpus (list): a list of sentences, each a list of (w,t) pairs.

  Return:
      Counter: (word, tag) -> number of occurrences.
  """
  pair_counts = Counter()
  for annotated_sentence in annotated_corpus:
    pair_counts.update(annotated_token for annotated_token in annotated_sentence)
  return pair_counts

def get_per_word_tag_counts(annotated_corpus):
  """Collect, for every word, the counts of the tags it was seen with.

  Args:
      annotated_corpus (list): a list of sentences, each a list of (w,t) pairs.

  Return:
      Counter: word -> Counter(tag -> number of occurrences with that word).
  """
  tag_counts_by_token = {}
  for (token, tag), occurrences in get_pairs_counts(annotated_corpus).items():
    # every (token, tag) pair appears exactly once in the pair counts
    tag_counts_by_token.setdefault(token, Counter())[tag] = occurrences
  return Counter(tag_counts_by_token)

def get_emission_counts(annotated_corpus, all_tag_counts):
  """Count, for every tag, the words it was emitted with.

  The corpus is scanned once (instead of once per tag).

  Args:
      annotated_corpus (list): a list of sentences, each a list of (w,t) pairs.
      all_tag_counts (Counter): the tags to collect counts for. Tags of the
        corpus that are missing from it are ignored; tags that never occur
        in the corpus get an empty Counter.

  Return:
      Counter: tag -> Counter(word -> number of occurrences with that tag).
  """
  emission_counts = {tag: Counter() for tag in all_tag_counts}
  for annotated_sentence in annotated_corpus:
    for annotated_token in annotated_sentence:
      word_counts = emission_counts.get(annotated_token[1])
      if word_counts is not None:
        word_counts[annotated_token[0]] += 1
  return Counter(emission_counts)

def get_transition_counts(annotated_corpus, all_tags):
  """Count, for every tag, the tags that directly follow it.

  Every sentence is wrapped with the START and END pseudo tags before the
  consecutive tag pairs are counted. The corpus is scanned once (instead of
  once per tag).

  Args:
      annotated_corpus (list): a list of sentences, each a list of (w,t) pairs.
      all_tags (list): the tags to collect counts for (as first element of a
        pair). Tags that are never followed by anything get an empty Counter.

  Return:
      Counter: tag_i -> Counter(tag_i+1 -> number of occurrences).
  """
  transition_counts = {tag: Counter() for tag in all_tags}
  for annotated_sentence in annotated_corpus:
    tag_sequence = [START] + [annotated_token[1] for annotated_token in annotated_sentence] + [END]
    # walk over all the pairs (tag_i, tag_i+1) of the sentence
    for previous_tag, next_tag in zip(tag_sequence, tag_sequence[1:]):
      followers = transition_counts.get(previous_tag)
      if followers is not None:
        followers[next_tag] += 1
  return Counter(transition_counts)

def get_A(transitionCounts, tags):
  """Build the add-one smoothed transition log-probabilities.

  A[t1][t2] = log((count(t1, t2) + 1) / (count(t1) + len(tags)))

  One pseudocount is added for every tag in `tags`, so the denominator grows
  by len(tags); this keeps every row a proper distribution (its
  probabilities sum to 1), including rows with no observed transitions
  such as the END tag.

  Args:
      transitionCounts (dict): tag_i -> Counter(tag_i+1 -> count).
      tags (list): all the tags a transition can lead to.

  Return:
      dict: tag_i -> {tag_i+1 -> log probability}.
  """
  A = {}
  tagset_size = len(tags)
  for tag_i, next_tags_transition_count in transitionCounts.items():
    # observed transitions out of tag_i plus one pseudocount per target tag
    smoothed_total = sum(next_tags_transition_count.values()) + tagset_size
    A[tag_i] = {
        tag_i_plus_1: log((next_tags_transition_count.get(tag_i_plus_1, 0) + 1) / smoothed_total)
        for tag_i_plus_1 in tags
    }
  return A

def get_B(emissionCounts, vocab):
  """Build the add-one smoothed emission log-probabilities.

  B[t][w] = log((count(t, w) + 1) / (max(count(t), 1) + len(vocab)))

  Args:
      emissionCounts (dict): tag -> Counter(word -> count).
      vocab: the collection of words to compute a probability for.

  Return:
      dict: tag -> {word -> log probability}.
  """
  B = {}
  vocab_size = len(vocab)
  for tag, word_counts in emissionCounts.items():
    # total emissions of the tag (at least 1) plus one pseudocount per word
    smoothed_total = max(sum(word_counts.values()), 1) + vocab_size
    row = {}
    for word in vocab:
      row[word] = log((1 + word_counts.get(word, 0)) / smoothed_total)
    B[tag] = row
  return B

def learn_params(tagged_sentences):
  """Computes and returns the allTagCounts, perWordTagCounts, transitionCounts
    and emissionCounts data-structures together with the A and B tables.

    allTagCounts and perWordTagCounts are meant for baseline tagging and do
    not include pseudocounts, dummy tags and unknowns.
    transitionCounts is computed with the START and END pseudo tags.
    A and B hold the smoothed log-probabilities derived from
    transitionCounts and emissionCounts.

    Args:
      tagged_sentences: a list of tagged sentences, each tagged sentence is a
        list of pairs (w,t), as returned by load_annotated_corpus().

    Return:
      [allTagCounts,perWordTagCounts,transitionCounts,emissionCounts,A,B] (a list)
  """
  vocab = {annotated_token[0] for annotated_sentence in tagged_sentences for annotated_token in annotated_sentence}

  # raw counts used by the baseline tagger
  allTagCounts = get_all_tag_counts(tagged_sentences)
  perWordTagCounts = get_per_word_tag_counts(tagged_sentences)

  # the transition model also needs the sentence boundary pseudo tags
  tags = [*allTagCounts.keys(), START, END]

  emissionCounts = get_emission_counts(tagged_sentences, allTagCounts)
  transitionCounts = get_transition_counts(tagged_sentences, tags)

  A = get_A(transitionCounts, tags)
  B = get_B(emissionCounts, vocab)

  learned = [allTagCounts, perWordTagCounts, transitionCounts, emissionCounts, A, B]
  return learned

def baseline_tag_sentence(sentence, perWordTagCounts, allTagCounts):
  """Tags a sentence with the most-frequent-tag baseline.

  Every known word gets the tag it was most frequently seen with. Every OOV
  word gets a tag sampled from the distribution of all tags.

  Args:
      sentence (list): a list of tokens (the sentence to tag)
      perWordTagCounts (Counter): tags per word as specified in learn_params()
      allTagCounts (Counter): tag counts, as specified in learn_params()

  Return:
      list: list of pairs (w,t), one per token of the sentence (same order)
  """

  def most_frequent_tag(token):
    tag_counts = perWordTagCounts[token]
    return max(tag_counts, key=tag_counts.get)

  def sampled_tag():
    candidates = list(allTagCounts.keys())
    frequencies = list(allTagCounts.values())
    return random.choices(candidates, weights=frequencies)[0]

  return [
      (token, most_frequent_tag(token) if token in perWordTagCounts else sampled_tag())
      for token in sentence
  ]

"""##Hidden Markov Model"""

#===========================================
#       POS tagging with HMM
#===========================================

def retrace_wrapper(end_item):
    """Retraces the tag sequence that leads to the END item.

    The END item itself is skipped: the retrace starts from the item it
    points to (the best item of the last position).
    """
    best_last_item = end_item[1]
    return retrace(best_last_item)
def retrace(item):
    """Returns a list of tags (retracing the sequence with the highest probability,
        reversing it and returning the list). The list correspond to the
        list of words in the sentence (same indices).

    The back-pointers are followed iteratively, so the length of the
    sentence is not bounded by the recursion limit.

    Args:
        item (tuple): a Viterbi item (tag, reference to the previous item, ...).
            The first item of a sequence has None as its reference.

    Return:
        list: the tags from the first position up to `item` (inclusive).
    """
    tags_sequence = []
    while item is not None:
        tags_sequence.append(item[0])
        item = item[1]
    # the tags were collected from the last position to the first one
    tags_sequence.reverse()
    return tags_sequence

def hmm_tag_sentence(sentence, A, B):
    """Returns a list of pairs (w,t) where each w corresponds to a word
    (same index) in the input sentence. Tagging is done with the Viterbi
    algorithm.

    Args:
        sentence (list): a list of tokens (the sentence to tag)
        A (dict): The HMM Transition probabilities
        B (dict): the HMM emission probabilities.

    Return:
        list: list of pairs. An empty sentence yields an empty list.
    """
    if len(sentence) == 0:
        # viterbi() needs at least one token to build its first column
        return []

    last_item = viterbi(sentence, A,B)
    tags_sequence = retrace_wrapper(last_item)
    return list(zip(sentence, tags_sequence))

def viterbi(sentence, A,B):
  """Runs the Viterbi algorithm, column by column. Each column is a list of
  tuples representing cells. Each cell ("item") is a tuple (t,r,p), where
  t is the tag being scored at the current position,
  r is a reference to the corresponding best item from the previous position
  (None at the first position),
  and p is the log probability of the sequence so far.

  Every tag of B is considered at every position. An OOV word is scored with
  the lowest emission log-probability of the tag; that value is computed once
  per tag and reused. Only the previous column is kept, since the items carry
  their own back-references. The transition into END is not scored: the END
  item carries the log probability of the best item of the last position.

  Args:
      sentence (list): a list of tokens (the sentence to tag), not empty
      A (dict): The HMM Transition probabilities
      B (dict): the HMM emission probabilities.

  Return:
      tuple: the last item, tagged with END, whose reference allows
      backtracking to the beginning of the sentence.
  """
  tags = list(B.keys())
  oov_log_prob = {}  # tag -> min emission log-prob, filled on demand

  def emission_log_prob(tag, token):
    tag_emissions = B[tag]
    if token in tag_emissions:
      return tag_emissions[token]
    #OOV words get the min log-prob value of B[tag].values()
    if tag not in oov_log_prob:
      oov_log_prob[tag] = min(tag_emissions.values())
    return oov_log_prob[tag]

  # first column: transitions out of the START pseudo tag
  first_token = sentence[0]
  previous_column = [
      (tag, None, A[START][tag] + emission_log_prob(tag, first_token))
      for tag in tags
  ]

  for token in sentence[1:]:
    current_column = []
    for curr_tag in tags:
      B_curr_token = emission_log_prob(curr_tag, token)

      best_previous_item = None
      best_score = None
      for previous_item in previous_column:
        score = previous_item[2] + A[previous_item[0]][curr_tag] + B_curr_token
        # strict comparison: the first best predecessor wins ties
        if best_score is None or score > best_score:
          best_previous_item = previous_item
          best_score = score

      current_column.append((curr_tag, best_previous_item, best_score))
    previous_column = current_column

  #Last item: points to the highest-scoring item of the last position
  best_last_item = max(previous_column, key=lambda item: item[2])
  return (END, best_last_item, best_last_item[2])


#a suggestion for a helper function. Not an API requirement
# def predict_next_best(word, tag, predecessor_list):
# """Returns a new item (tupple)
# """


def joint_prob(sentence, A, B):
  """Returns the joint log probability of the given sequence of words and
    tags under the HMM model.

    OOV words are scored with the lowest emission log-probability of their
    tag. The transition into the END pseudo tag is not included.

    Args:
        sentence (pair): a sequence of pairs (w,t) to compute.
        A (dict): The HMM Transition probabilities
        B (dict): the HMM emission probabilities.
    """
  def emission_log_prob(token, tag):
    tag_emissions = B[tag]
    if token in tag_emissions:
      return tag_emissions[token]
    return min(tag_emissions.values())  # OOV word

  first_token, first_tag = sentence[0]
  # joint log prob. of words and tags, starting from the START pseudo tag
  p = A[START][first_tag] + emission_log_prob(first_token, first_tag)

  previous_tag = first_tag
  for curr_token, curr_tag in sentence[1:]:
    B_curr_token = emission_log_prob(curr_token, curr_tag)
    p = p + A[previous_tag][curr_tag] + B_curr_token
    previous_tag = curr_tag

  assert isfinite(p) and p<0  # a sum of logs of probabilities is negative
  return p

"""##BiLSTM"""

#===========================================
#       POS tagging with BiLSTM
#===========================================

""" You are required to support two types of bi-LSTM:
    1. a vanilla biLSTM in which the input layer is based on simple word embeddings
    2. a case-based BiLSTM in which input vectors combine a 3-dim binary vector
        encoding case information, see
        https://arxiv.org/pdf/1510.06168.pdf
"""

# Suggestions and tips, not part of the required API
#
#  1. You can use PyTorch torch.nn module to define your LSTM, see:
#     https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
#  2. You can have the BLSTM tagger model(s) implemented in a dedicated class
#     (this could be a subclass of torch.nn.Module)
#  3. Think about padding.
#  4. Consider using dropout layers
#  5. Think about the way you implement the input representation
#  6. Consider using different unit types (LSTM, GRU,LeRU)

class BiLSTMModel(nn.Module):
    """Vanilla BiLSTM tagger: embedding -> bidirectional LSTM -> MLP -> softmax.

    Args:
        input_dim (int): size of the input vocabulary.
        embedding_dim (int): size of the word embedding vectors.
        hidden_dim (int): size of the LSTM hidden state (per direction).
        num_layers (int): number of stacked LSTM layers.
        output_dim (int): number of tags.
        pad_idx (int): vocabulary index of the padding token.
    """
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim, pad_idx):
        super(BiLSTMModel, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.output_dim = output_dim

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim (per direction). batch_first is left
        # at its default, so the input is [seq_len, batch].
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            bidirectional=True,
                            num_layers=num_layers)

        # both directions are concatenated on the last dimension
        lstm_output_dim = hidden_dim * 2
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(lstm_output_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim),
            # Normalize over the tag dimension. Without an explicit dim,
            # Softmax picks dim 0 for a 3-D input, i.e. it would normalize
            # across the positions of the sentence instead of across tags.
            # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself,
            # so feeding it these probabilities normalizes twice.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Returns the tag distribution of every token.

        Args:
            x (LongTensor): token indices, [seq_len, batch].

        Return:
            Tensor: [seq_len, batch, output_dim], each last-dim row sums to 1.
        """
        embedded = self.embedding(x)

        lstm_out, _ = self.lstm(embedded)

        out = self.linear_relu_stack(lstm_out)
        return out

class BiLSTMCaseModel(nn.Module):
    """Case-based BiLSTM tagger.

    Same architecture as the vanilla model, but every word embedding is
    extended with one extra feature holding the case code of the token, so
    the LSTM input size is embedding_dim + 1.

    Args:
        input_dim (int): size of the input vocabulary.
        embedding_dim (int): size of the word embedding vectors.
        hidden_dim (int): size of the LSTM hidden state (per direction).
        num_layers (int): number of stacked LSTM layers.
        output_dim (int): number of tags.
        pad_idx (int): vocabulary index of the padding token.
    """
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim, pad_idx):
        super(BiLSTMCaseModel, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.output_dim = output_dim

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        # The LSTM takes word embeddings (plus the case feature) as inputs,
        # and outputs hidden states with dimensionality hidden_dim (per
        # direction). batch_first is left at its default.
        self.lstm = nn.LSTM(embedding_dim+1,
                            hidden_dim,
                            bidirectional=True,
                            num_layers=num_layers)

        # both directions are concatenated on the last dimension
        lstm_output_dim = hidden_dim * 2
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(lstm_output_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim),
            # Normalize over the tag dimension. Without an explicit dim,
            # Softmax picks dim 0 for a 3-D input, i.e. it would normalize
            # across the positions of the sentence instead of across tags.
            # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself,
            # so feeding it these probabilities normalizes twice.
            nn.Softmax(dim=-1)
        )

    def forward(self, x_text, x_case):
        """Returns the tag distribution of every token.

        Args:
            x_text (LongTensor): token indices, [seq_len, batch].
            x_case (Tensor): case code of every token, either
                [seq_len, batch] or [seq_len] for a single sentence.

        Return:
            Tensor: [seq_len, batch, output_dim], each last-dim row sums to 1.
        """
        embedded = self.embedding(x_text)

        # add a trailing feature dimension (and a batch dimension of 1 for a
        # single un-batched sentence)
        if len(x_case.size()) > 1:
          x_case = x_case.reshape(x_case.size()[0], x_case.size()[1], 1)
        else:
          x_case = x_case.reshape(x_case.size()[0],1, 1)
        # the case codes arrive as integers; match the embedding dtype so the
        # concatenation does not depend on implicit type promotion
        x_case = x_case.to(embedded.dtype)

        concat = torch.cat((embedded,x_case), 2)

        lstm_out, _ = self.lstm(concat)

        out = self.linear_relu_stack(lstm_out)
        return out


def init_weights(m):
    """Re-initializes, in place, every parameter of the module `m` (including
    the parameters of its sub-modules) from a normal distribution with
    mean 0 and standard deviation 0.1.
    """
    for parameter in m.parameters():
        nn.init.normal_(parameter.data, mean=0, std=0.1)

def load_pretrained_embeddings(path, vocab=None):
  """ Returns the pretrained vectors, loaded from the file at the specified
      path. The file format is the same as
      https://www.kaggle.com/danielwillgeorge/glove6b100dtxt
      You can also access the vectors at:
        https://www.dropbox.com/s/qxak38ybjom696y/glove.6B.100d.txt?dl=0
      Each line of the file holds a word followed by the values of its
      vector, separated by whitespace. Blank lines are skipped.

      For efficiency (time and memory) only the vectors of the requested
      words are loaded. A vector is kept when its word appears in vocab
      either as is or as the lowercase form of a vocab word.

  Args:
      path (str): full path to the embeddings file (UTF-8 encoded)
      vocab (list): a list of words to have embeddings for. Defaults to None,
        which loads every vector of the file.

  Return:
      dict: word (as spelled in the file) -> numpy array (float64)
  """
  #vectors = torchtext.vocab.Vectors(embeddings_file_path)
  wanted = None
  if vocab is not None:
    # a set makes the per-line membership test O(1)
    wanted = set(vocab)
    wanted |= {w.lower() for w in wanted}

  vectors = {}
  with open(path, 'r', encoding='utf-8') as f:
      for line in f:
          split_line = line.split()
          if not split_line:
            continue
          word = split_line[0]
          if wanted is None or word in wanted:
            vectors[word] = np.array(split_line[1:], dtype=np.float64)
  print(f"{len(vectors)} words loaded!")
  return vectors


def prepare_embeddings(embeddings_file_path, vocab):
  """
  Loads the relevant embeddings and arranges them for the rnn model.

  Return:
      (stoi, embeds_vecs): stoi maps every loaded word to its position in
      embeds_vecs, a list holding one tensor per loaded word.
  """
  embeds = load_pretrained_embeddings(embeddings_file_path, vocab)

  stoi = {word: position for position, word in enumerate(embeds)}
  embeds_vecs = [torch.tensor(vector) for vector in embeds.values()]
  return stoi, embeds_vecs

def prepare_data(annotated_corpus, vocab, max_vocab_size, min_freq, batch_size, pretrained_embeddings_fn, embedding_dim, stoi=None, embeds_vecs=None):
  """
  Arranges the training data for the training process of the vanilla model.

  Builds the torchtext fields, dataset and bucket iterator, builds the text
  and label vocabularies and attaches the pretrained vectors to the text
  vocabulary. The embeddings are loaded from pretrained_embeddings_fn only
  when stoi or embeds_vecs is not supplied.

  Return:
      (train_iter, src, trg, stoi, embeds_vecs), where src and trg are the
      (name, Field) pairs of the text and of the labels.
  """
  text_field = data_l.Field(batch_first=False, lower=True)
  labels_field = data_l.Field(is_target=True)
  src = ('text', text_field)
  trg = ('labels', labels_field)
  fields = [src, trg]

  # one example per sentence: its tokens and its tags
  examples = []
  for annotated_sentence in annotated_corpus:
    sentence_tokens = [annotated_pair[0] for annotated_pair in annotated_sentence]
    sentence_tags = [annotated_pair[1] for annotated_pair in annotated_sentence]
    examples.append(data_l.Example.fromlist(data=[sentence_tokens, sentence_tags], fields=fields))

  train_dataset = data_l.Dataset(examples=examples, fields=fields)
  train_iter = data_l.BucketIterator(dataset=train_dataset, batch_size=batch_size, device=device, sort_key=lambda x: len(x.text))

  # build the vocabulary
  text_field.build_vocab(train_dataset, max_size=max_vocab_size, min_freq=min_freq)
  labels_field.build_vocab(train_dataset)

  embeddings_missing = stoi is None or embeds_vecs is None
  if embeddings_missing:
    stoi, embeds_vecs = prepare_embeddings(pretrained_embeddings_fn, vocab)
  text_field.vocab.set_vectors(stoi=stoi, vectors=embeds_vecs, dim=embedding_dim)

  return train_iter, src, trg, stoi, embeds_vecs

def get_case_features(token):
  """Encodes the case of a token as a single integer code.

  Return:
      int: 1 if the token is fully lowercase, 2 if it is fully uppercase,
      3 if it is title-cased (leading capital letter), 0 otherwise.
  """
  if token.islower():
    return 1
  if token.isupper():
    return 2
  if token.istitle():
    return 3
  return 0

def prepare_data_case(annotated_corpus, vocab, max_vocab_size, min_freq, batch_size, pretrained_embeddings_fn, embedding_dim, stoi=None, embeds_vecs=None):
  """
  Arranges the training data for the training process of the case-based model.

  Same as prepare_data(), with an additional numeric 'case' field holding
  the case code of every token (see get_case_features()). The embeddings are
  loaded from pretrained_embeddings_fn only when stoi or embeds_vecs is not
  supplied.

  Return:
      (train_iter, src, src_case, trg, stoi, embeds_vecs), where src,
      src_case and trg are the (name, Field) pairs of the text, of the case
      codes and of the labels.
  """
  text_field = data_l.Field(batch_first=False, lower=True)
  case_field = data_l.Field(use_vocab=False, dtype=torch.int, pad_token=0)
  labels_field = data_l.Field(is_target=True)
  src = ('text', text_field)
  src_case = ('case', case_field)
  trg = ('labels', labels_field)
  fields = [src, src_case, trg]

  # one example per sentence: its tokens, their case codes and its tags
  examples = []
  for annotated_sentence in annotated_corpus:
    sentence_tokens = [annotated_pair[0] for annotated_pair in annotated_sentence]
    sentence_cases = [get_case_features(annotated_pair[0]) for annotated_pair in annotated_sentence]
    sentence_tags = [annotated_pair[1] for annotated_pair in annotated_sentence]
    examples.append(data_l.Example.fromlist(data=[sentence_tokens, sentence_cases, sentence_tags], fields=fields))

  train_dataset = data_l.Dataset(examples=examples, fields=fields)
  train_iter = data_l.BucketIterator(dataset=train_dataset, batch_size=batch_size, device=device, sort_key=lambda x: len(x.text))

  # build the vocabulary (the case field is numeric and needs none)
  text_field.build_vocab(train_dataset, max_size=max_vocab_size, min_freq=min_freq)
  labels_field.build_vocab(train_dataset)

  embeddings_missing = stoi is None or embeds_vecs is None
  if embeddings_missing:
    stoi, embeds_vecs = prepare_embeddings(pretrained_embeddings_fn, vocab)
  text_field.vocab.set_vectors(stoi=stoi, vectors=embeds_vecs, dim=embedding_dim)

  return train_iter, src, src_case, trg, stoi, embeds_vecs

"""###Init RNN"""

def initialize_rnn_model(params_d):
  """Returns a dictionary with the objects and parameters needed to run/train
      the lstm model. The LSTM is initialized based on the specified parameters.
      The returned dict may have other or additional fields.

  Args:
      params_d (dict): a dictionary of parameters specifying the model. The dict
                      should include (at least) the following keys:
                      {'max_vocab_size': max vocabulary size (int),
                      'min_frequency': the occurrence threshold to consider (int),
                      'input_rep': 0 for the vanilla and 1 for the case-base (int),
                      'embedding_dimension': embedding vectors size (int),
                      'num_of_layers': number of layers (int),
                      'output_dimension': number of tags in tagset (int),
                      'pretrained_embeddings_fn': str,
                      'data_fn': str
                      }
                      max_vocab_size sets a constraint on the vocab dimension.
                          If its value is smaller than the number of unique
                          tokens in data_fn, the words to consider are the most
                          frequent words. If max_vocab_size = -1, all words
                          occurring at least min_frequency times are considered.
                      min_frequency provides a threshold under which words are
                          not considered at all. (If min_frequency=1 all words
                          up to max_vocab_size are considered;
                          If min_frequency=3, we only consider words that appear
                          at least three times.)
                      input_rep (int): sets the input representation. Values:
                          0 (vanilla), 1 (case-base).
                      output_dimension is not used: the output size of the
                          model is always the size of the label vocabulary
                          built from data_fn.

  Return:
      a dictionary with the following key-value pairs:
          'lstm' (torch.nn.Module), 'input_rep' (0 or 1), 'vocab',
          'max_vocab_size', 'min_freq', 'pretrained_embeddings_fn', 'stoi',
          'embeds_vecs', 'embedding_dim', 'epochs', 'learning_rate',
          'batch_size'

  Raises:
      ValueError: if input_rep is neither 0 nor 1.
  """
  input_rep = params_d["input_rep"]
  if input_rep not in (0, 1):
    # fail before loading the corpus and the embeddings
    raise ValueError(f"unsupported input_rep: {input_rep!r} (expected 0 or 1)")

  annotated_corpus=load_annotated_corpus(params_d['data_fn'])
  vocab = {annotated_token[0] for annotated_sentence in annotated_corpus for annotated_token in annotated_sentence}

  batch_size=16
  hidden_dim = 128
  embedding_dim = params_d["embedding_dimension"]

  # torchtext expects None (not -1) for an unbounded vocabulary
  max_vocab_size = params_d["max_vocab_size"] if params_d["max_vocab_size"] > 0 else None

  data_kwargs = dict(batch_size=batch_size,
                     pretrained_embeddings_fn=params_d["pretrained_embeddings_fn"],
                     embedding_dim=embedding_dim)
  if input_rep == 0:
    _, src, trg, stoi, embeds_vecs = prepare_data(annotated_corpus, vocab, max_vocab_size, params_d["min_frequency"], **data_kwargs)
    model_class = BiLSTMModel
  else:
    _, src, _, trg, stoi, embeds_vecs = prepare_data_case(annotated_corpus, vocab, max_vocab_size, params_d["min_frequency"], **data_kwargs)
    model_class = BiLSTMCaseModel

  # Model initialization
  text_field = src[1]
  input_dim = len(text_field.vocab)
  pad_idx = text_field.vocab.stoi[text_field.pad_token]
  # one output per entry of the label vocabulary
  output_dim = len(trg[1].vocab.itos)

  lstm_model = model_class(input_dim=input_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_layers=params_d["num_of_layers"], output_dim=output_dim, pad_idx=pad_idx)

  lstm_model.apply(init_weights)
  # overwrite the random embedding weights with the pretrained vectors and
  # keep the padding vector at zero
  pretrained_embeddings = text_field.vocab.vectors
  lstm_model.embedding.weight.data.copy_(pretrained_embeddings)
  lstm_model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)
  print(lstm_model)

  model = {
    'lstm': lstm_model,
    'input_rep': input_rep,
    'vocab': vocab,
    'max_vocab_size': max_vocab_size,
    'min_freq': params_d["min_frequency"],
    'pretrained_embeddings_fn': params_d["pretrained_embeddings_fn"],
    'stoi': stoi,
    'embeds_vecs': embeds_vecs,
    'embedding_dim': embedding_dim,
    'epochs': 80,
    'learning_rate': 0.15,
    'batch_size': batch_size
  }

  return model


def train_rnn(model, train_data, val_data = None):
    """Trains the BiLSTM model on the specified data.

    The fields built for the training data are stored in the model dict
    (keys 'src', 'trg' and, for the case-based model, 'src_case') and the
    LSTM is moved to the module's device. Training uses SGD, a
    cross-entropy loss that ignores the padding tag and a step learning
    rate scheduler. val_data is currently not used.

    Args:
        model (dict): the model dict as returned by initialize_rnn_model()
        train_data (list): a list of annotated sentences in the format returned
                            by load_annotated_corpus()
        val_data (list): a list of annotated sentences in the format returned
                            by load_annotated_corpus() to be used for validation.
                            Defaults to None
    """
    if model["input_rep"] == 0:
      train_iter, src, trg, stoi, embeds_vecs = prepare_data(
          train_data, model['vocab'], model['max_vocab_size'], model["min_freq"], model["batch_size"],
          pretrained_embeddings_fn=model["pretrained_embeddings_fn"],
          embedding_dim=model["embedding_dim"],
          stoi=model["stoi"],
          embeds_vecs=model["embeds_vecs"])
      model.update({"src": src, "trg": trg})
    elif model["input_rep"] == 1:
      train_iter, src, src_case, trg, stoi, embeds_vecs = prepare_data_case(
          train_data, model['vocab'], model['max_vocab_size'], model["min_freq"], model["batch_size"],
          pretrained_embeddings_fn=model["pretrained_embeddings_fn"],
          embedding_dim=model["embedding_dim"],
          stoi=model["stoi"],
          embeds_vecs=model["embeds_vecs"])
      model.update({"src": src, "src_case": src_case, "trg": trg})

    optimizer = optim.SGD(model["lstm"].parameters(), lr=model['learning_rate'])
    # padded positions of the labels must not contribute to the loss
    labels_field = trg[1]
    tag_pad_idx = labels_field.vocab.stoi[labels_field.pad_token]
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10.0, gamma=0.8)

    model["lstm"] = model["lstm"].to(device)
    criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx).to(device)

    train(model=model["lstm"],
          opt=optimizer,
          crit=criterion,
          scheduler=scheduler,
          train_iter=train_iter,
          epochs=model['epochs'],
          tag_pad_idx=tag_pad_idx,
          input_rep=model["input_rep"])


def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns the accuracy of a batch as a fraction (8 right out of 10 gives
    0.8, NOT 8). Positions whose gold tag is the padding tag are ignored.

    Args:
        preds (Tensor): the scores of every tag for every token, [n, n_tags]
        y (Tensor): the gold tag index of every token, [n]
        tag_pad_idx (int): the index of the padding tag
    """
    is_real_token = y != tag_pad_idx
    predicted_tags = preds.argmax(dim = 1) # index of the highest score
    gold_tags = y[is_real_token]
    hits = predicted_tags[is_real_token].eq(gold_tags)
    return hits.sum() / gold_tags.shape[0]

def pad_case(case_vecs, pad=None):
  """Pads the case features of a batch of sentences to a common length.

  The input lists are left untouched (the padding is applied to copies).

  Args:
      case_vecs (list): one list of case features per sentence. A feature is
        either a number or a fixed-size list of numbers.
      pad: the value appended to the short sentences. Defaults to 0; when the
        features are lists, pass a list of the same size.

  Return:
      Tensor: the padded features with the sentence (batch) dimension and the
      position dimension swapped, i.e. [max_len, batch] or
      [max_len, batch, n_features].
  """
  if pad is None:
    pad = 0
  target_pad_length = max(len(vec) for vec in case_vecs)
  padded = [list(vec) + [pad] * (target_pad_length - len(vec)) for vec in case_vecs]
  padded_tensor = torch.Tensor(padded)
  # swap batch and position: a reshape would keep the memory order and mix
  # the features of different sentences
  return padded_tensor.transpose(0, 1).contiguous()

def train_epoch(model, opt, crit, train_iter, tag_pad_idx, input_rep):
    """Runs one training pass over all the batches of train_iter.

    Args:
        model (nn.Module): the tagger to train
        opt: the optimizer
        crit: the loss function
        train_iter: an iterable of batches with the attributes text, labels
            and (for input_rep 1) case
        tag_pad_idx (int): the index of the padding tag
        input_rep (int): 0 for the vanilla model, 1 for the case-based model

    Return:
        (loss, accuracy): the mean loss and the mean accuracy per batch
    """
    model.train(mode=True) #setting the model to training mode
    loss_total = 0
    accuracy_total = 0
    for batch in train_iter:
        if input_rep == 0:
          scores = model(batch.text)
        elif input_rep == 1:
          scores = model(batch.text, batch.case)

        opt.zero_grad()

        # flatten to one row per token: [seq_len * batch, n_tags] and [seq_len * batch]
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_labels = batch.labels.view(-1)

        loss = crit(flat_scores, flat_labels)
        loss.backward()
        opt.step()

        batch_accuracy = categorical_accuracy(flat_scores, flat_labels, tag_pad_idx)
        loss_total += loss.item()
        accuracy_total += batch_accuracy.item()

    n_batches = len(train_iter)
    return loss_total / n_batches, accuracy_total / n_batches

def train(model, opt, crit, scheduler, train_iter, epochs, tag_pad_idx, input_rep):
    """Train ``model`` for ``epochs`` epochs and print a short report after each one.

    Args:
        model: the network to train (passed through to train_epoch).
        opt: the optimizer (passed through to train_epoch).
        crit: the loss function (passed through to train_epoch).
        scheduler: learning-rate scheduler; stepped once after every epoch.
        train_iter: iterable of training batches.
        epochs (int): number of epochs to run; nothing happens when it is 0.
        tag_pad_idx (int): padding tag index, ignored by the accuracy measure.
        input_rep (int): 0 for the vanilla input, 1 for the case-based input.

    Return:
        None
    """
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train_loss, train_acc = train_epoch(
            model=model,
            opt=opt,
            crit=crit,
            train_iter=train_iter,
            tag_pad_idx=tag_pad_idx,
            input_rep=input_rep,
        )
        scheduler.step()

        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s '.format(epoch, time.time() - epoch_start_time))
        # Plain '|' separators: '\|' is not a valid string escape sequence.
        print(f"|train accuracy {train_acc} | train loss {train_loss} | last_lr: {scheduler.get_last_lr()}")


def rnn_tag_sentence(sentence, model):
    """ Returns a list of pairs (w,t) where each w corresponds to a word
        (same index) in the input sentence and t is the predicted tag.

    Args:
        sentence (list): a list of tokens (the sentence to tag)
        model (dict):  a dictionary with the trained BiLSTM model and all that is needed
                        to tag a sentence. The keys read here are "lstm" (the network),
                        "src" and "trg" (pairs whose second item is the field holding
                        the vocabulary) and "input_rep" (0 or 1).

    Return:
        list: list of pairs; empty when the sentence is empty.

    Raises:
        ValueError: if model["input_rep"] is neither 0 nor 1.
    """
    lstm = model["lstm"]
    lstm.eval()

    if not sentence:
        # Nothing to tag; avoid feeding a zero-length sequence to the network.
        return []

    text_field = model["src"][1]
    tag_field = model["trg"][1]

    # Mirror the field's preprocessing; keep the tokens as they are when the
    # field does not lowercase (previously `tokens` was left undefined here).
    if text_field.lower:
        tokens = [t.lower() for t in sentence]
    else:
        tokens = list(sentence)

    token_ids = [text_field.vocab.stoi[t] for t in tokens]
    # unsqueeze(-1) adds a trailing axis of size 1 (presumably the batch axis).
    token_tensor = torch.LongTensor(token_ids).unsqueeze(-1).to(device)

    input_rep = model["input_rep"]
    with torch.no_grad():  # inference only: no autograd graph is needed
        if input_rep == 0:
            predictions = lstm(token_tensor)
        elif input_rep == 1:
            # Case features are computed on the original (non-lowercased) tokens.
            case_feat = torch.tensor([get_case_features(t) for t in sentence]).to(device)
            predictions = lstm(token_tensor, case_feat)
        else:
            raise ValueError(f"unsupported input_rep: {input_rep!r}")

    top_predictions = predictions.argmax(-1)
    predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]

    return list(zip(sentence, predicted_tags))


def get_best_performing_model_params():
    """Returns a dictionary specifying the parameters of the best performing
        BiLSTM model.
        IMPORTANT: this is a *hard coded* dictionary that will be used to create
        a model and train a model by calling
                initialize_rnn_model() and train_lstm()
    """
    return dict(
        max_vocab_size=-1,  # max vocabulary size (int)
        min_frequency=4,  # the occurrence threshold to consider (int)
        input_rep=1,  # 0 for the vanilla and 1 for the case-base (int)
        embedding_dimension=100,  # embedding vectors size (int)
        num_of_layers=2,  # number of layers (int)
        output_dimension=19,  # number of tags in tagset (int), including unknown and pad
        pretrained_embeddings_fn="glove.6B.100d.txt",  # str
        data_fn="en-ud-train.upos.tsv",  # str
    )

#===========================================================
#       Wrapper function (tagging with a specified model)
#===========================================================

def tag_sentence(sentence, model):
    """Returns a list of pairs (w,t) where pair corresponds to a word (same index) in
    the input sentence. Tagging is done with the specified model.

    Args:
        sentence (list): a list of tokens (the sentence to tag)
        model (dict): a dictionary where key is the model name and the value is
           an ordered list of the parameters of the trained model (baseline, HMM)
           or the model itself and the input_rep flag (LSTMs). Only the first
           entry of the dictionary is used.

        Models that must be supported (you can add more):
        1. baseline: {'baseline': [perWordTagCounts, allTagCounts]}
        2. HMM: {'hmm': [A,B]}
        3. Vanilla BiLSTM: {'blstm':[model_dict]}
        4. BiLSTM+case: {'cblstm': [model_dict]}
        5. (NOT REQUIRED: you can add other variations, augmenting the input
            with further subword information, with character-level word embedding etc.)

        The parameters for the baseline model are:
        perWordTagCounts (Counter): tags per word as specified in learn_params()
        allTagCounts (Counter): tag counts, as specified in learn_params()

        The parameters for the HMM are:
        A (dict): The HMM Transition probabilities
        B (dict): the HMM emission probabilities.

        Parameters for an LSTM: the model dictionary (allows tagging the given sentence).
        Both the documented form ``[model_dict]`` and a bare ``model_dict`` are accepted.


    Return:
        list: list of pairs

    Raises:
        ValueError: if ``model`` is empty or its model name is not supported.
    """
    if not model:
        raise ValueError("model must map a model name to its parameters")

    model_name, params = next(iter(model.items()))

    if model_name == 'baseline':
        return baseline_tag_sentence(sentence, params[0], params[1])
    if model_name == 'hmm':
        return hmm_tag_sentence(sentence, params[0], params[1])
    if model_name in ('blstm', 'cblstm'):
        # The documented value is a one-element list wrapping the model
        # dictionary; unwrap it, but keep accepting the bare dictionary too.
        model_dict = params[0] if isinstance(params, (list, tuple)) else params
        return rnn_tag_sentence(sentence, model_dict)

    raise ValueError(f"unsupported model name: {model_name!r}")

def count_correct(gold_sentence, pred_sentence):
    """Return the total number of correctly predicted tags, the total number of
    correctly predicted tags for oov words and the number of oov words in the
    given sentence.

    A word counts as oov when it is not a key of the module-level
    ``perWordTagCounts``. Both sentences are printed before counting.

    Args:
        gold_sentence (list): list of pairs, assume to be gold labels
        pred_sentence (list): list of pairs, tags are predicted by tagger

    Return:
        tuple: (correct, correctOOV, OOV)
    """
    assert len(gold_sentence)==len(pred_sentence)
    print("gold_sentence", gold_sentence)
    print("pred_sentence", pred_sentence)

    known_words = perWordTagCounts.keys()
    # One flag per gold token: True when the word is out of vocabulary.
    oov_flags = [gold_pair[0] not in known_words for gold_pair in gold_sentence]
    n_oov = sum(oov_flags)

    hits = 0
    oov_hits = 0
    for gold_pair, pred_pair, is_oov in zip(gold_sentence, pred_sentence, oov_flags):
        if gold_pair[1] == pred_pair[1]:
            hits += 1
            if is_oov:
                oov_hits += 1

    return hits, oov_hits, n_oov