parent c5cffd1069
commit 1c817c315f
@@ -1,5 +1,8 @@
 dramaoftheday.md
 .idea/
 .venv/
-saved_models/*
-!saved_models/rnn_2epochs.pth
+src/DOTD_generator/saved_models/*
+!src/DOTD_generator/saved_models/rnn_2epochs.pth
+*py[cod]
+.ipynb_checkpoints
+*egg-info
@@ -1 +1 @@
-3.7.3
+3.7.7
@@ -0,0 +1,32 @@
+setup:
+	python -m venv .venv && . .venv/bin/activate
+	pip install --upgrade pip
+	pip install -r requirements.txt
+
+clean-pyc:
+	find . -name '*.pyc' -exec rm -f {} +
+	find . -name '*.pyo' -exec rm -f {} +
+	find . -name '*~' -exec rm -f {} +
+	find . -name '__pycache__' -exec rm -fr {} +
+
+clean-test:
+	rm -f .coverage
+	rm -f .coverage.*
+
+clean: clean-pyc clean-test
+
+test: clean
+	. .venv/bin/activate && py.test tests --cov=src --cov-report=term-missing --cov-fail-under 95
+
+mypy:
+	. .venv/bin/activate && mypy src
+
+lint:
+	. .venv/bin/activate && pylint src -j 4 --reports=y
+
+docs: FORCE
+	cd docs; . .venv/bin/activate && sphinx-apidoc -o ./source ./src
+	cd docs; . .venv/bin/activate && sphinx-build -b html ./source ./build
+FORCE:
+
+check: test lint mypy
@@ -1,19 +0,0 @@
-+++
-title = "Drama of the Day"
-date = "2021-11-07"
-+++
-
-
-This excerpt of a drama, although inspired by Shakespeare, is entirely artifically generated.
-The language model that was used to produce this text only had prior knowledge about the latin alphabet. All the (pseudo-)words produced were learnt through imitating Shakespeare. Therefore, you might find quite a few neologisms, in true Shakespearean spirit.
-
-
-Everyday, the model is told to use the prompt **TOM:** and then produces the rest up to 10000 characters, thereby creating a unique new drama.
-
-
-If you like to know more about this, [click here](/DOTD_generator).
-
-
----
-
-
@@ -0,0 +1,34 @@
+astroid==2.9.3
+attrs==21.4.0
+black==22.1.0
+certifi==2021.10.8
+charset-normalizer==2.0.11
+click==8.0.3
+codecov==2.1.12
+coverage==6.3.1
+idna==3.3
+importlib-metadata==4.10.1
+iniconfig==1.1.1
+isort==5.10.1
+lazy-object-proxy==1.7.1
+mccabe==0.6.1
+mypy==0.931
+mypy-extensions==0.4.3
+packaging==21.3
+pathspec==0.9.0
+platformdirs==2.4.1
+pluggy==1.0.0
+py==1.11.0
+pylint==2.12.2
+pyparsing==3.0.7
+pytest==7.0.0
+pytest-cov==3.0.0
+requests==2.27.1
+toml==0.10.2
+tomli==2.0.0
+torch==1.10.2
+typed-ast==1.5.2
+typing-extensions==4.0.1
+urllib3==1.26.8
+wrapt==1.13.3
+zipp==3.7.0
@@ -1,2 +0,0 @@
-__version__ = "0.1.0"
-
@@ -1,66 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-import torch
-import re
-
-class RNN(torch.nn.Module):
-    def __init__(self, vocab_size, hidden_size, embedding_size, batch=32, layers=2):
-        super(RNN, self).__init__()
-        self.hidden_size = hidden_size  # size of the GRU layers
-        self.batch = batch
-        self.layers = layers  # how many GRU layers
-        self.word_embeds = torch.nn.Embedding(vocab_size, embedding_size)  # Embedding layer
-        self.gru = torch.nn.GRU(embedding_size, hidden_size, layers, batch_first=True)  # GRU layer(s)
-        self.output_layer = torch.nn.Linear(hidden_size, vocab_size)
-
-    def forward(self, inputs, hidden):
-        x = self.word_embeds(inputs)  # transform the input integer into a high dimensional embedding
-        output, hidden = self.gru(x, hidden)  # Compute the output of the GRU layer(s)
-        output = self.output_layer(output)  # compute the logits
-        return output, hidden
-
-    def initHidden(self):
-        return torch.zeros(self.layers, self.batch, self.hidden_size)
-
-def preprocess(text):
-    alphabet = sorted(set(text))
-    letter_to_int = {let: ind for ind, let in enumerate(alphabet)}
-    int_to_letter = {ind: let for ind, let in enumerate(alphabet)}
-    letter_ints = [letter_to_int[letter] for letter in text]
-    alphabet_size = len(alphabet)
-    return int_to_letter, letter_to_int, alphabet_size, letter_ints
-
-
-def markdown_header():
-    return open("/home/tux/shakespeare_generator/markdown_header.txt", "rt").read()
-
-text = open("/home/tux/shakespeare_generator/shakespeare.txt", "rt").read()
-init_seq = "TOM:"
-int_to_letter, letter_to_int, alphabet_size, letter_ints = preprocess(text)
-rnn = RNN(alphabet_size, 1024, 256, layers=2, batch=1)  # instantiate model
-rnn.load_state_dict(torch.load("/home/tux/shakespeare_generator/rnn_2epochs.pth", map_location=torch.device('cpu')))  # load weights
-rnn.eval()  # tell model its time to evaluate
-
-def write_drama(seq, temp=0.7, max_seq_len=1000):
-    hidden = rnn.initHidden()
-    input_idx = torch.LongTensor([[letter_to_int[s] for s in seq]])  # input characters to ints
-    for i in range(max_seq_len):
-        output, hidden = rnn(input_idx, hidden)  # predict the logits for the next character
-        pred = torch.squeeze(output, 0)[-1]
-        pred = pred / temp  # apply temperature
-        pred_id = torch.distributions.categorical.Categorical(logits=pred).sample()  # sample from the distribution
-        input_idx = torch.cat((input_idx[:,1:], pred_id.reshape(1,-1)), 1)  # predicted character is added to our input
-        seq += int_to_letter[pred_id.item()]  # add predicted character to sequence
-    return seq
-
-def stylise_drama(drama):
-    return re.sub(r"\n", r"<br>\n", drama)
-
-output_text = markdown_header() + stylise_drama(write_drama(init_seq))
-
-with open("/home/tux/shakespeare_generator/dramaoftheday.md", "w") as f:
-    f.write(output_text)
-
-
-
-
@@ -0,0 +1,7 @@
+# coding: utf-8
+
+from os.path import dirname, abspath
+
+__version__ = "0.1.0"
+
+ROOT_DIR = dirname(abspath(__file__))
@@ -0,0 +1,36 @@
+# coding: utf-8
+
+import torch
+from torch.utils.data import Dataset
+
+
+def get_shakespeare():
+    return open("data/shakespeare.txt", "rt").read()
+
+
+def preprocess(text: str) -> tuple:
+    alphabet = sorted(set(text))
+    letter_to_int = {let: ind for ind, let in enumerate(alphabet)}
+    int_to_letter = {ind: let for ind, let in enumerate(alphabet)}
+    letter_ints = [letter_to_int[letter] for letter in text]
+    alphabet_size = len(alphabet)
+    return int_to_letter, letter_to_int, alphabet_size, letter_ints
+
+
+class ShakespeareDataset(Dataset):
+    def __init__(self, seq_len: int):
+        _, _, _, self.text = preprocess(get_shakespeare())
+        self.x = torch.LongTensor(self.text[:-1])  # get the data
+        self.y = torch.LongTensor(self.text[1:])
+        self.seq_len = seq_len  # set the sequence length
+
+    def __len__(self) -> int:
+        return (
+            len(self.text) - self.seq_len - 1
+        )  # length of corpora minus sequence length minus shift
+
+    def __getitem__(self, index: int) -> tuple:
+        return (
+            self.x[index : index + self.seq_len],
+            self.y[index : index + self.seq_len],  # labels come from the shifted tensor, not self.x
+        )  # return tuple of (sample, label)
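
Note on the hunk above: ShakespeareDataset pairs each input window with the same window shifted left by one character, so the label at every position is the character that follows the input. A minimal standalone sketch of that shift, using a toy string and illustrative names rather than anything from this commit:

    import torch

    text = "To be, or not to be"
    alphabet = sorted(set(text))
    letter_to_int = {let: ind for ind, let in enumerate(alphabet)}
    int_to_letter = {ind: let for ind, let in enumerate(alphabet)}
    ints = [letter_to_int[c] for c in text]

    x = torch.LongTensor(ints[:-1])  # inputs: every character except the last
    y = torch.LongTensor(ints[1:])   # targets: the same sequence shifted left by one

    seq_len, index = 5, 0
    sample, label = x[index : index + seq_len], y[index : index + seq_len]
    print("".join(int_to_letter[i.item()] for i in sample))  # -> "To be"
    print("".join(int_to_letter[i.item()] for i in label))   # -> "o be,"

Each target character is the one that follows its input character, which is exactly the next-character prediction task the training loop's cross-entropy loss assumes.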
@@ -0,0 +1,20 @@
++++
+title = "Drama of the Day"
+date = "2021-11-07"
++++
+
+
+This excerpt of a drama, although inspired by Shakespeare, is entirely artificially generated.
+The language model that was used to produce this text only had prior knowledge about the Latin alphabet.
+All the (pseudo-)words produced were learnt through imitating Shakespeare. Therefore, you might find quite a few neologisms, in true Shakespearean spirit.
+
+
+Every day, the model is told to use the prompt **TOM:** and then produces the rest up to 10000 characters, thereby creating a unique new drama.
+
+
+If you'd like to know more about this, [click here](/DOTD_generator).
+
+
+---
+
+
@@ -0,0 +1,59 @@
+# coding: utf-8
+
+import re
+import torch
+from dotd.model import GRU
+from dotd.data import get_shakespeare, preprocess
+
+
+def get_header():
+    return open("data/markdown_header.txt", "rt").read()
+
+
+def load_model(alphabet_size: int, path: str = "saved_models/rnn_2epochs.pth") -> GRU:
+    gru = GRU(alphabet_size, 1024, 256, layers=2, batch=1)  # instantiate model
+    gru.load_state_dict(
+        torch.load(
+            path,
+            map_location=torch.device("cpu"),
+        )
+    )  # load weights
+    return gru.eval()
+
+
+def make_pretty(text: str) -> str:
+    return re.sub(r"\n", r"<br>\n", text)
+
+
+def write_drama(seq, temp=0.7, max_seq_len=1000):
+    int_to_letter, letter_to_int, alphabet_size, _ = preprocess(get_shakespeare())
+    gru = load_model(alphabet_size=alphabet_size)
+    hidden = gru.init_hidden()
+    input_idx = torch.LongTensor(
+        [[letter_to_int[s] for s in seq]]
+    )  # input characters to ints
+    for i in range(max_seq_len):
+        output, hidden = gru(
+            input_idx, hidden
+        )  # predict the logits for the next character
+        pred = torch.squeeze(output, 0)[-1]
+        pred = pred / temp  # apply temperature
+        pred_id = torch.distributions.categorical.Categorical(
+            logits=pred
+        ).sample()  # sample from the distribution
+        input_idx = torch.cat(
+            (input_idx[:, 1:], pred_id.reshape(1, -1)), 1
+        )  # predicted character is added to our input
+        seq += int_to_letter[pred_id.item()]  # add predicted character to sequence
+    return seq
+
+
+def main():
+    init_seq = "TOM:"
+    output_text = get_header() + make_pretty(write_drama(init_seq))
+    with open("dramaoftheday.md", "w") as f:
+        f.write(output_text)
+
+
+if __name__ == "__main__":
+    main()
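
A note on the temp parameter in write_drama above: dividing the logits by a temperature below 1.0 sharpens the softmax distribution that characters are sampled from, while values above 1.0 flatten it. A standalone sketch with made-up logits (not part of this commit):

    import torch

    logits = torch.tensor([2.0, 1.0, 0.1])
    for temp in (0.5, 1.0, 2.0):
        probs = torch.softmax(logits / temp, dim=0)
        print(temp, [round(p, 3) for p in probs.tolist()])
    # temp < 1 concentrates mass on the likeliest character (safer, more repetitive text);
    # temp > 1 spreads it out (more surprising text, more neologisms).

The default of 0.7 therefore biases generation toward the model's most confident continuations while still allowing variety.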
@@ -0,0 +1,78 @@
+# coding: utf-8
+
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import numpy as np
+from dotd.data import ShakespeareDataset, preprocess, get_shakespeare
+
+
+class GRU(torch.nn.Module):
+    def __init__(self, vocab_size, hidden_size, embedding_size, batch=32, layers=2):
+        super(GRU, self).__init__()
+        self.hidden_size = hidden_size  # size of the GRU layers
+        self.batch = batch
+        self.layers = layers  # how many GRU layers
+        self.word_embeds = torch.nn.Embedding(
+            vocab_size, embedding_size
+        )  # Embedding layer
+        self.gru = torch.nn.GRU(
+            embedding_size, hidden_size, layers, batch_first=True
+        )  # GRU layer(s)
+        self.output_layer = torch.nn.Linear(hidden_size, vocab_size)
+
+    def forward(self, inputs, hidden):
+        x = self.word_embeds(
+            inputs
+        )  # transform the input integer into a high dimensional embedding
+        output, hidden = self.gru(x, hidden)  # Compute the output of the GRU layer(s)
+        output = self.output_layer(output)  # compute the logits
+        return output, hidden
+
+    def init_hidden(self):
+        return torch.zeros(self.layers, self.batch, self.hidden_size)
+
+
+def train_epoch(model, loader, optim, loss, device):
+    current_loss = []  # record running loss
+    model.to(device)  # put the model on the specified device
+    hidden = model.init_hidden().to(device)  # create the hidden state
+    model.train()  # tell the model it's training time
+    for X, y in loader:
+        X, y = X.to(device), y.to(
+            device
+        )  # collect the data and labels from the dataloader and put them on the device
+        optim.zero_grad()  # empty the gradients
+        output, hidden = model(X, hidden)  # compute the output
+        hidden = hidden.detach()  # take the hidden state out of the graph
+        batch_loss = loss(output.transpose(1, 2), y)  # compute loss
+        batch_loss.backward()  # compute gradients
+        optim.step()  # update weights
+        current_loss.append(batch_loss.item())  # record loss
+    epoch_loss = np.mean(current_loss)
+    return epoch_loss
+
+
+def train(epochs: int):
+    _, _, alphabet_size, _ = preprocess(
+        get_shakespeare()
+    )  # get amount of characters for one-hot to embedding
+    gru = GRU(alphabet_size, 1024, 256, layers=2)  # instantiate model
+    loss = torch.nn.CrossEntropyLoss()
+    optim = torch.optim.Adam(gru.parameters())
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    loader = DataLoader(
+        ShakespeareDataset(seq_len=100),
+        batch_size=32,
+        shuffle=True,
+        num_workers=2,
+        drop_last=True,
+    )
+    for e in tqdm(range(epochs)):
+        l = train_epoch(gru, loader, optim, loss, device)
+        print(f"Epoch: {e}, Loss: {l}")
+    torch.save(gru.state_dict(), "saved_models/gru_{}epochs.pth".format(epochs + 1))
+
+
+if __name__ == "__main__":
+    train(20)
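
One detail in train_epoch worth spelling out: torch.nn.CrossEntropyLoss expects class scores in dimension 1, while the model emits (batch, seq_len, vocab_size); that is what the output.transpose(1, 2) call is for. A shape-only sketch with made-up sizes (not part of this commit):

    import torch

    batch, seq_len, vocab = 32, 100, 65           # illustrative sizes only
    output = torch.randn(batch, seq_len, vocab)   # what the linear head returns
    targets = torch.randint(0, vocab, (batch, seq_len))
    loss = torch.nn.CrossEntropyLoss()
    print(loss(output.transpose(1, 2), targets).item())  # scores as (batch, vocab, seq)

The hidden.detach() call just above the loss serves a related purpose: it truncates backpropagation at the batch boundary so gradients do not flow through the entire history of hidden states.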