# Problems 08

This exercise consists purely of bonus points.

Due date: 27.01.20 11:59pm <br>
Email Address: statistik@cl.uni-heidelberg.de AND staniek@cl.uni-heidelberg.de<br>
Subject: [statistik] Problems08 Nachname, Vorname <br>
Format: .zip file containing the notebook and all auxiliary files. Answers in English or German. Mathematical formulas in $\LaTeX$.

In [None]:
# Maps each iris species name (the last field of a data row) to its integer
# class id; the ids follow the order in which the species are listed here.
label_names = {
    species: class_id
    for class_id, species in enumerate(
        ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
    )
}

def read_data(filename):
    """
    Reads a comma-separated corpus file into a list of rows.

    Every non-blank line is split on "," into its raw string fields; no type
    conversion is done here. Leading and trailing whitespace is stripped from
    each line, and empty or whitespace-only lines are skipped, so stray blank
    lines in the file never show up as bogus rows.

    Args:
    filename -- str -- The name of the corpus file.

    Returns:
    documents -- list -- One list of field strings per non-blank line.
    """
    documents = []
    with open(filename, 'r', encoding="utf-8") as data:
        # Iterate lazily instead of reading the whole file into one string.
        for line in data:
            line = line.strip()
            if line:
                documents.append(line.split(","))
    return documents
    

def get_vectors(data):
    """
    Converts raw string rows into numeric feature vectors with class ids.
    ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'] -> [[5.1, 3.5, 1.4, 0.2], 0]

    All fields except the last are parsed as floats; the last field is looked
    up in label_names to obtain the integer class id.

    Args:
    data -- list -- A list of all documents with their corresponding label.

    Returns:
    vectors -- list -- One [feature_list, class_id] pair per input row.
    """
    vectors = []
    for row in data:
        features = [float(value) for value in row[:-1]]
        class_id = label_names[row[-1]]
        vectors.append([features, class_id])
    return vectors

def split_train_test(data):
    """
    Splits the examples into a training portion and a test portion.

    Every fifth example (positions 0, 5, 10, ...) goes to the test set and
    all remaining examples go to the training set. The relative order of the
    examples is preserved in both lists.

    Args:
    data -- iterable -- The examples to split.

    Returns:
    training_data -- list -- The examples at positions not divisible by 5.
    test_data -- list -- The examples at positions divisible by 5.
    """
    training_data = []
    test_data = []

    for position, example in enumerate(data):
        if position % 5 == 0:
            test_data.append(example)
        else:
            training_data.append(example)

    return training_data, test_data


# Exercise 1

https://stackabuse.com/introduction-to-pytorch-for-classification/

Please read the tutorial.
Then annotate the following two code boxes with comments.

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn


class ClassificationModel(nn.Module):
    """
    Feed-forward classifier with one hidden layer.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> class_labels) -> log-softmax. The forward pass
    returns log-probabilities over the classes.
    """

    def __init__(self, input_size=4, class_labels=3, hidden_size=3):
        """
        Creates the two linear layers of the network.

        Args:
        input_size -- int -- Number of input features per example.
        class_labels -- int -- Number of output classes.
        hidden_size -- int -- Number of units in the hidden layer.
        """
        super().__init__()
        # Input features -> hidden representation.
        self.linear = nn.Linear(input_size, hidden_size)
        # Hidden representation -> one score per class.
        self.linear2 = nn.Linear(hidden_size, class_labels)

    def forward(self, x):
        """
        Computes class log-probabilities for the given inputs.

        Args:
        x -- torch.Tensor -- Float tensor whose last dimension has size
                             input_size.

        Returns:
        log_probs -- torch.Tensor -- Same leading dimensions as x; the last
                                     dimension has size class_labels.
        """
        hidden = F.relu(self.linear(x))
        scores = self.linear2(hidden)
        # The class axis must be named explicitly: omitting `dim` is
        # deprecated and makes PyTorch guess the axis from the tensor rank,
        # which picks the wrong one for some ranks (e.g. 3-D input).
        return F.log_softmax(scores, dim=-1)

In [None]:

# Load the iris corpus, convert every row to a [feature vector, class id]
# pair and hold out every fifth example for evaluation.
iris = read_data("iris.data")
data = get_vectors(iris)
training_data, test_data = split_train_test(data)

model = ClassificationModel()

# The model returns log-probabilities, which is the input NLLLoss expects.
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

for epoch in range(1000):
    total_loss = 0.0
    for vector, label in training_data:
        # Gradients accumulate across backward() calls, so reset them before
        # processing the next example.
        optimizer.zero_grad()
        # Wrapping in a list adds a batch dimension of size 1.
        vector_tensor = torch.tensor([vector])
        label_tensor = torch.tensor([label])
        prediction = model(vector_tensor)
        loss = loss_function(prediction, label_tensor)
        # Backpropagate and take one SGD step for this single example.
        loss.backward()
        optimizer.step()
        # .item() extracts a plain float; summing the loss tensor itself
        # would drag its autograd history along for the whole epoch.
        total_loss += loss.item()
    print(total_loss)

# Measure accuracy on the held-out test split. The examples seen during
# training would give an overly optimistic estimate.
correct, total = 0, 0
with torch.no_grad():  # no gradients are needed for evaluation
    for vector, label in test_data:
        vector_tensor = torch.tensor([vector])
        prediction = model(vector_tensor)
        # The index of the largest log-probability is the predicted class.
        if prediction.argmax().item() == label:
            correct += 1
        total += 1
print(correct / total)
    

# Exercise 2

Draw the neural network with the corresponding edge weights between all nodes. To do so, extract the weights from the trained neural network.

# Exercise 3

In the box below, modify the neural network so that it consists of two hidden layers.

In [None]:
class ClassificationModel(nn.Module):
    """
    Feed-forward classifier, currently with a single hidden layer.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> class_labels) -> log-softmax. The forward pass
    returns log-probabilities over the classes.

    TODO (Exercise 3): extend this network to two hidden layers.
    """

    def __init__(self, input_size=4, class_labels=3, hidden_size=5):
        """
        Creates the linear layers of the network.

        Args:
        input_size -- int -- Number of input features per example.
        class_labels -- int -- Number of output classes.
        hidden_size -- int -- Number of units in the hidden layer.
        """
        super().__init__()
        # Input features -> hidden representation.
        self.linear = nn.Linear(input_size, hidden_size)
        # Hidden representation -> one score per class.
        self.linear2 = nn.Linear(hidden_size, class_labels)

    def forward(self, x):
        """
        Computes class log-probabilities for the given inputs.

        Args:
        x -- torch.Tensor -- Float tensor whose last dimension has size
                             input_size.

        Returns:
        log_probs -- torch.Tensor -- Same leading dimensions as x; the last
                                     dimension has size class_labels.
        """
        hidden = F.relu(self.linear(x))
        scores = self.linear2(hidden)
        # The class axis must be named explicitly: omitting `dim` is
        # deprecated and makes PyTorch guess the axis from the tensor rank,
        # which picks the wrong one for some ranks (e.g. 3-D input).
        return F.log_softmax(scores, dim=-1)

# Exercise 4

In the code below, complete the forward pass of the LanguageModel.
Read through the tutorial: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

Use a text document that you have available as input to the network.

Also, comment the code.


In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

class LanguageModel(nn.Module):
    """
    LSTM sequence model that predicts a distribution over output symbols for
    every input position.

    Architecture: Embedding -> LSTM (n_layers, batch_first) -> Dropout ->
    Linear(hidden_dim -> output_size) -> log-softmax.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.0):
        """
        Creates the layers of the model.

        Args:
        vocab_size -- int -- Number of distinct input symbol ids.
        output_size -- int -- Number of distinct output symbols.
        embedding_dim -- int -- Size of each symbol embedding.
        hidden_dim -- int -- Size of the LSTM hidden state.
        n_layers -- int -- Number of stacked LSTM layers.
        drop_prob -- float -- Dropout probability, used both between LSTM
                              layers and on the LSTM output.
        """
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Symbol id -> dense vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: inputs and outputs are (batch, sequence, features).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        # LSTM state -> one score per output symbol.
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        """
        Runs the model over a batch of symbol-id sequences.

        Args:
        x -- torch.Tensor -- Symbol ids of shape (batch, sequence).
        hidden -- tuple -- The (h, c) LSTM state, e.g. from init_hidden.

        Returns:
        out -- torch.Tensor -- Log-probabilities of shape
                               (batch * sequence, output_size), one row per
                               input position.
        hidden -- tuple -- The updated (h, c) LSTM state.
        """
        batch_size = x.size(0)
        # Embedding lookups require integer (long) indices.
        x = x.long()
        embeds = self.embedding(x).view(batch_size, x.size(1), -1)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Flatten batch and time so every position becomes one row; reshape
        # (unlike view) also copes with a non-contiguous LSTM output.
        lstm_out = lstm_out.reshape(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        # Project to one score per output symbol and normalise to
        # log-probabilities, the form NLLLoss expects.
        out = self.fc(out)
        out = F.log_softmax(out, dim=-1)
        return out, hidden

    def init_hidden(self, batch_size):
        """
        Creates an all-zero initial LSTM state.

        Args:
        batch_size -- int -- Number of sequences in the batch.

        Returns:
        hidden -- tuple -- Two zero tensors (h, c), each of shape
                           (n_layers, batch_size, hidden_dim).
        """
        # Allocate the state with the dtype and device of the model's own
        # parameters so it matches wherever the model currently lives.
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden = (
            torch.zeros(shape, dtype=weight.dtype, device=weight.device),
            torch.zeros(shape, dtype=weight.dtype, device=weight.device),
        )
        return hidden
    


In [None]:
# Read the training text; every non-empty line is one training sequence.
with open("DATEINAME.TXT", encoding="utf-8") as text_file:
    texts = [x for x in text_file.read().split("\n") if x != ""]
complete_text = " ".join(texts)

# Character vocabulary. "^" and "$" are the sequence start and end markers.
# Sorting makes the id assignment reproducible; plain set iteration order
# can differ from one interpreter run to the next.
char2id = {x: i for i, x in enumerate(sorted(set(complete_text) | {"^", "$"}))}
id2char = {i: x for x, i in char2id.items()}
model = LanguageModel(len(char2id), len(char2id), 100, 200, 3)
loss_function = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for i in range(150):
    print(i)
    totalloss = 0.0
    for text in texts[:200]:
        # The input is the line shifted right by the start marker and the
        # target is the line followed by the end marker, so at every position
        # the model is trained to predict the next character.
        in_tensor = torch.tensor([[char2id[x] for x in "^" + text]])
        out_tensor = torch.tensor([[char2id[x] for x in text + "$"]])
        # Each line is an independent sequence: start from a zero state.
        hidden = model.init_hidden(1)
        optimizer.zero_grad()
        pred, hidden = model(in_tensor, hidden)
        loss = loss_function(pred, out_tensor.view(-1))
        # .item() extracts a plain float so no tensor is carried across steps.
        totalloss += loss.item()
        loss.backward()
        optimizer.step()
    print(totalloss)
    


In [None]:
#TORCH.DISTRIBUTIONS.CATEGORICAL
from torch.distributions.categorical import Categorical

# Generate text one character at a time: feed the previously sampled
# character back in, carry the LSTM state along, and stop after the end
# marker "$" or after 100 characters.
start = "^"
pieces = []
hidden = model.init_hidden(1)
with torch.no_grad():  # sampling needs no gradients
    for i in range(100):
        in_tensor = torch.tensor([[char2id[start]]])
        pred, hidden = model(in_tensor, hidden)
        # Passing the scores as logits lets Categorical normalise them in
        # log space instead of exponentiating them first.
        m = Categorical(logits=pred)
        start = id2char[m.sample().item()]
        pieces.append(start)
        if start == "$":
            break
total = "".join(pieces)
print(total)