initial commit

4 years ago · 2056b207f6
commit 2056b207f6
5 changed files with 40082 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+dramaoftheday.md
+.venv/
--- a/drama_of_the_day.py
+++ b/drama_of_the_day.py
@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# coding: utf-8
+import torch
+import re
+
+class RNN(torch.nn.Module):
+    def __init__(self, vocab_size, hidden_size, embedding_size, batch=32, layers=2):
+        super(RNN, self).__init__()
+        self.hidden_size = hidden_size # size of the GRU layers
+        self.batch = batch 
+        self.layers = layers # how many GRU layers
+        self.word_embeds = torch.nn.Embedding(vocab_size, embedding_size) # Embedding layer
+        self.gru = torch.nn.GRU(embedding_size, hidden_size, layers, batch_first=True) # GRU layer(s)
+        self.output_layer = torch.nn.Linear(hidden_size, vocab_size)
+        
+    def forward(self, inputs, hidden):
+        x = self.word_embeds(inputs) # transform the input integer into a high dimensional embedding
+        output, hidden = self.gru(x, hidden) # Compute the output of the GRU layer(s)
+        output = self.output_layer(output) # compute the logits
+        return output, hidden
+
+    def initHidden(self):
+        return torch.zeros(self.layers, self.batch, self.hidden_size) 
+
+def preprocess(text):
+    alphabet = sorted(set(text))
+    letter_to_int = {let: ind for ind, let in enumerate(alphabet)}
+    int_to_letter = {ind: let for ind, let in enumerate(alphabet)}
+    letter_ints = [letter_to_int[letter] for letter in text]
+    alphabet_size = len(alphabet)
+    return int_to_letter, letter_to_int, alphabet_size, letter_ints
+
+
+def markdown_header():
+    return open("/home/tux/shakespeare_generator/markdown_header.txt", "rt").read()
+
+text = open("/home/tux/shakespeare_generator/shakespeare.txt", "rt").read()
+init_seq = "TOM:"
+int_to_letter, letter_to_int, alphabet_size, letter_ints = preprocess(text)
+rnn = RNN(alphabet_size, 1024, 256, layers=2, batch=1) # instantiate model
+rnn.load_state_dict(torch.load("/home/tux/shakespeare_generator/rnn_2epochs.pth", map_location=torch.device('cpu'))) # load weights
+rnn.eval() # tell model its time to evaluate
+
+def write_drama(seq, temp=0.7, max_seq_len=1000):
+    hidden = rnn.initHidden() 
+    input_idx = torch.LongTensor([[letter_to_int[s] for s in seq]]) # input characters to ints
+    for i in range(max_seq_len):
+        output, hidden = rnn(input_idx, hidden) # predict the logits for the next character
+        pred = torch.squeeze(output, 0)[-1]
+        pred = pred / temp # apply temperature
+        pred_id = torch.distributions.categorical.Categorical(logits=pred).sample() # sample from the distribution
+        input_idx = torch.cat((input_idx[:,1:], pred_id.reshape(1,-1)), 1) # predicted character is added to our input
+        seq += int_to_letter[pred_id.item()] # add predicted character to sequence
+    return seq
+
+def stylise_drama(drama):
+    return re.sub(r"\n", r"<br>\n", drama)
+
+output_text = markdown_header() + stylise_drama(write_drama(init_seq))
+
+with open("/home/tux/shakespeare_generator/dramaoftheday.md", "w") as f:
+    f.write(output_text)
+
+
+
+
--- a/markdown_header.txt
+++ b/markdown_header.txt
@ -0,0 +1,14 @@
+++
+title = "Drama of the Day"
+date = "2021-11-07"
+++
+
+
+This excerpt of a drama, although inspired by Shakespeare, is entirely artifically generated.  
+The language model that was used to produce this text only had prior knowledge about the latin alphabet. This means that all the (pseudo-)words produced were learnt through imitating Shakespeare. Therefore, you might find quite a few neologisms, in true Shakespearean spirit.  
+Everyday, the model is told to use the prompt **TOM:** and then produces the rest up to 10000 characters, thereby creating a unique new drama.
+
+
+---
+
+
--- a/rnn_2epochs.pth
+++ b/rnn_2epochs.pth
--- a/shakespeare.txt
+++ b/shakespeare.txt