From 1c817c315f806840b36eacdda34bed674fa85880 Mon Sep 17 00:00:00 2001 From: Tom Weber Date: Mon, 7 Feb 2022 16:33:34 +0100 Subject: [PATCH] refactor --- .gitignore | 7 +- .python-version | 2 +- Makefile | 32 +++++++ markdown_header.txt | 19 ----- requirements.txt | 34 ++++++++ src/DOTD_generator/__init__.py | 2 - src/DOTD_generator/drama_of_the_day.py | 66 --------------- src/dotd/__init__.py | 7 ++ src/dotd/data.py | 36 ++++++++ src/dotd/data/markdown_header.txt | 20 +++++ {data => src/dotd/data}/shakespeare.txt | 0 src/dotd/generate.py | 59 +++++++++++++ src/dotd/model.py | 78 ++++++++++++++++++ .../dotd/saved_models}/rnn_2epochs.pth | Bin 14 files changed, 272 insertions(+), 90 deletions(-) create mode 100644 Makefile delete mode 100644 markdown_header.txt create mode 100644 requirements.txt delete mode 100644 src/DOTD_generator/__init__.py delete mode 100644 src/DOTD_generator/drama_of_the_day.py create mode 100644 src/dotd/__init__.py create mode 100644 src/dotd/data.py create mode 100644 src/dotd/data/markdown_header.txt rename {data => src/dotd/data}/shakespeare.txt (100%) create mode 100644 src/dotd/generate.py create mode 100644 src/dotd/model.py rename {saved_models => src/dotd/saved_models}/rnn_2epochs.pth (100%) diff --git a/.gitignore b/.gitignore index 3546198..c72639b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ dramaoftheday.md .idea/ .venv/ -saved_models/* -!saved_models/rnn_2epochs.pth +src/dotd/saved_models/* +!src/dotd/saved_models/rnn_2epochs.pth +*py[cod] +.ipynb_checkpoints +*egg-info diff --git a/.python-version b/.python-version index c1e43e6..d2577d9 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.7.3 +3.7.7 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d8d9d76 --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +setup: + python -m venv .venv + . .venv/bin/activate && pip install --upgrade pip + . .venv/bin/activate && pip install -r requirements.txt + +clean-pyc: + find . 
-name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +clean-test: + rm -f .coverage + rm -f .coverage.* + +clean: clean-pyc clean-test + +test: clean + . .venv/bin/activate && py.test tests --cov=src --cov-report=term-missing --cov-fail-under 95 + +mypy: + . .venv/bin/activate && mypy src + +lint: + . .venv/bin/activate && pylint src -j 4 --reports=y + +docs: FORCE + cd docs && . ../.venv/bin/activate && sphinx-apidoc -o ./source ../src + cd docs && . ../.venv/bin/activate && sphinx-build -b html ./source ./build +FORCE: + +check: test lint mypy diff --git a/markdown_header.txt b/markdown_header.txt deleted file mode 100644 index 2260b9c..0000000 --- a/markdown_header.txt +++ /dev/null @@ -1,19 +0,0 @@ -+++ -title = "Drama of the Day" -date = "2021-11-07" -+++ - - -This excerpt of a drama, although inspired by Shakespeare, is entirely artifically generated. -The language model that was used to produce this text only had prior knowledge about the latin alphabet. All the (pseudo-)words produced were learnt through imitating Shakespeare. Therefore, you might find quite a few neologisms, in true Shakespearean spirit. - - -Everyday, the model is told to use the prompt **TOM:** and then produces the rest up to 10000 characters, thereby creating a unique new drama. - - -If you like to know more about this, [click here](/DOTD_generator). 
- - ---- - - diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a59b2ca --- /dev/null +++ b/requirements.txt @@ -0,0 +1,34 @@ +astroid==2.9.3 +attrs==21.4.0 +black==22.1.0 +certifi==2021.10.8 +charset-normalizer==2.0.11 +click==8.0.3 +codecov==2.1.12 +coverage==6.3.1 +idna==3.3 +importlib-metadata==4.10.1 +iniconfig==1.1.1 +isort==5.10.1 +lazy-object-proxy==1.7.1 +mccabe==0.6.1 +mypy==0.931 +mypy-extensions==0.4.3 +packaging==21.3 +pathspec==0.9.0 +platformdirs==2.4.1 +pluggy==1.0.0 +py==1.11.0 +pylint==2.12.2 +pyparsing==3.0.7 +pytest==7.0.0 +pytest-cov==3.0.0 +requests==2.27.1 +toml==0.10.2 +tomli==2.0.0 +torch==1.10.2 +typed-ast==1.5.2 +typing-extensions==4.0.1 +urllib3==1.26.8 +wrapt==1.13.3 +zipp==3.7.0 diff --git a/src/DOTD_generator/__init__.py b/src/DOTD_generator/__init__.py deleted file mode 100644 index f38884b..0000000 --- a/src/DOTD_generator/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__version__ = "0.1.0" - diff --git a/src/DOTD_generator/drama_of_the_day.py b/src/DOTD_generator/drama_of_the_day.py deleted file mode 100644 index 09769fc..0000000 --- a/src/DOTD_generator/drama_of_the_day.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -import torch -import re - -class RNN(torch.nn.Module): - def __init__(self, vocab_size, hidden_size, embedding_size, batch=32, layers=2): - super(RNN, self).__init__() - self.hidden_size = hidden_size # size of the GRU layers - self.batch = batch - self.layers = layers # how many GRU layers - self.word_embeds = torch.nn.Embedding(vocab_size, embedding_size) # Embedding layer - self.gru = torch.nn.GRU(embedding_size, hidden_size, layers, batch_first=True) # GRU layer(s) - self.output_layer = torch.nn.Linear(hidden_size, vocab_size) - - def forward(self, inputs, hidden): - x = self.word_embeds(inputs) # transform the input integer into a high dimensional embedding - output, hidden = self.gru(x, hidden) # Compute the output of the GRU layer(s) - output = 
self.output_layer(output) # compute the logits - return output, hidden - - def initHidden(self): - return torch.zeros(self.layers, self.batch, self.hidden_size) - -def preprocess(text): - alphabet = sorted(set(text)) - letter_to_int = {let: ind for ind, let in enumerate(alphabet)} - int_to_letter = {ind: let for ind, let in enumerate(alphabet)} - letter_ints = [letter_to_int[letter] for letter in text] - alphabet_size = len(alphabet) - return int_to_letter, letter_to_int, alphabet_size, letter_ints - - -def markdown_header(): - return open("/home/tux/shakespeare_generator/markdown_header.txt", "rt").read() - -text = open("/home/tux/shakespeare_generator/shakespeare.txt", "rt").read() -init_seq = "TOM:" -int_to_letter, letter_to_int, alphabet_size, letter_ints = preprocess(text) -rnn = RNN(alphabet_size, 1024, 256, layers=2, batch=1) # instantiate model -rnn.load_state_dict(torch.load("/home/tux/shakespeare_generator/rnn_2epochs.pth", map_location=torch.device('cpu'))) # load weights -rnn.eval() # tell model its time to evaluate - -def write_drama(seq, temp=0.7, max_seq_len=1000): - hidden = rnn.initHidden() - input_idx = torch.LongTensor([[letter_to_int[s] for s in seq]]) # input characters to ints - for i in range(max_seq_len): - output, hidden = rnn(input_idx, hidden) # predict the logits for the next character - pred = torch.squeeze(output, 0)[-1] - pred = pred / temp # apply temperature - pred_id = torch.distributions.categorical.Categorical(logits=pred).sample() # sample from the distribution - input_idx = torch.cat((input_idx[:,1:], pred_id.reshape(1,-1)), 1) # predicted character is added to our input - seq += int_to_letter[pred_id.item()] # add predicted character to sequence - return seq - -def stylise_drama(drama): - return re.sub(r"\n", r"
\n", drama) - -output_text = markdown_header() + stylise_drama(write_drama(init_seq)) - -with open("/home/tux/shakespeare_generator/dramaoftheday.md", "w") as f: - f.write(output_text) - - - - diff --git a/src/dotd/__init__.py b/src/dotd/__init__.py new file mode 100644 index 0000000..245c8fa --- /dev/null +++ b/src/dotd/__init__.py @@ -0,0 +1,7 @@ +# coding: utf-8 + +from os.path import dirname, abspath + +__version__ = "0.1.0" + +ROOT_DIR = dirname(abspath(__file__)) \ No newline at end of file diff --git a/src/dotd/data.py b/src/dotd/data.py new file mode 100644 index 0000000..54384ac --- /dev/null +++ b/src/dotd/data.py @@ -0,0 +1,36 @@ +# coding: utf-8 + +import torch +from torch.utils.data import Dataset + + +def get_shakespeare(): + return open("data/shakespeare.txt", "rt").read() + + +def preprocess(text: str) -> tuple: + alphabet = sorted(set(text)) + letter_to_int = {let: ind for ind, let in enumerate(alphabet)} + int_to_letter = {ind: let for ind, let in enumerate(alphabet)} + letter_ints = [letter_to_int[letter] for letter in text] + alphabet_size = len(alphabet) + return int_to_letter, letter_to_int, alphabet_size, letter_ints + + +class ShakespeareDataset(Dataset): + def __init__(self, seq_len: int): + _, _, _, self.text = preprocess(get_shakespeare()) + self.x = torch.LongTensor(self.text[:-1]) # inputs: every character except the last + self.y = torch.LongTensor(self.text[1:]) # labels: the same text shifted one character ahead + self.seq_len = seq_len # set the sequence length + + def __len__(self) -> int: + return ( + len(self.text) - self.seq_len - 1 + ) # length of corpus minus sequence length minus shift + + def __getitem__(self, index: int) -> tuple: + return ( + self.x[index : index + self.seq_len], + self.y[index : index + self.seq_len], + ) # return tuple of (sample, label); the label window comes from the shifted tensor self.y diff --git a/src/dotd/data/markdown_header.txt b/src/dotd/data/markdown_header.txt new file mode 100644 index 0000000..a096913 --- /dev/null +++ b/src/dotd/data/markdown_header.txt @@ -0,0 +1,20 @@ ++++ +title = "Drama of the Day" +date = 
"2021-11-07" ++++ + + +This excerpt of a drama, although inspired by Shakespeare, is entirely artificially generated. +The language model that was used to produce this text only had prior knowledge about the Latin alphabet. +All the (pseudo-)words produced were learnt through imitating Shakespeare. Therefore, you might find quite a few neologisms, in true Shakespearean spirit. + + +Every day, the model is told to use the prompt **TOM:** and then produces the rest up to 1000 characters, thereby creating a unique new drama. + + +If you would like to know more about this, [click here](/DOTD_generator). + + +--- + + diff --git a/data/shakespeare.txt b/src/dotd/data/shakespeare.txt similarity index 100% rename from data/shakespeare.txt rename to src/dotd/data/shakespeare.txt diff --git a/src/dotd/generate.py b/src/dotd/generate.py new file mode 100644 index 0000000..595bd6b --- /dev/null +++ b/src/dotd/generate.py @@ -0,0 +1,59 @@ +# coding: utf-8 + +import re +import torch +from dotd.model import GRU +from dotd.data import get_shakespeare, preprocess + + +def get_header(): + return open("data/markdown_header.txt", "rt").read() + + +def load_model(alphabet_size: int, path: str = "saved_models/rnn_2epochs.pth") -> GRU: + gru = GRU(alphabet_size, 1024, 256, layers=2, batch=1) # instantiate model + gru.load_state_dict( + torch.load( + path, + map_location=torch.device("cpu"), + ) + ) # load weights + return gru.eval() + + +def make_pretty(text: str) -> str: + return re.sub(r"\n", r"
\n", text) + + +def write_drama(seq, temp=0.7, max_seq_len=1000): + int_to_letter, letter_to_int, alphabet_size, _ = preprocess(get_shakespeare()) + gru = load_model(alphabet_size=alphabet_size) + hidden = gru.init_hidden() + input_idx = torch.LongTensor( + [[letter_to_int[s] for s in seq]] + ) # input characters to ints + for _ in range(max_seq_len): + output, hidden = gru( + input_idx, hidden + ) # predict the logits for the next character + pred = torch.squeeze(output, 0)[-1] + pred = pred / temp # apply temperature + pred_id = torch.distributions.categorical.Categorical( + logits=pred + ).sample() # sample from the distribution + input_idx = torch.cat( + (input_idx[:, 1:], pred_id.reshape(1, -1)), 1 + ) # predicted character is added to our input + seq += int_to_letter[pred_id.item()] # add predicted character to sequence + return seq + + +def main(): + init_seq = "TOM:" + output_text = get_header() + make_pretty(write_drama(init_seq)) + with open("dramaoftheday.md", "w") as f: + f.write(output_text) + + +if __name__ == "__main__": + main() diff --git a/src/dotd/model.py b/src/dotd/model.py new file mode 100644 index 0000000..ef6cad0 --- /dev/null +++ b/src/dotd/model.py @@ -0,0 +1,78 @@ +# coding: utf-8 + +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm +import numpy as np +from dotd.data import ShakespeareDataset, preprocess, get_shakespeare + + +class GRU(torch.nn.Module): + def __init__(self, vocab_size, hidden_size, embedding_size, batch=32, layers=2): + super(GRU, self).__init__() + self.hidden_size = hidden_size # size of the GRU layers + self.batch = batch + self.layers = layers # how many GRU layers + self.word_embeds = torch.nn.Embedding( + vocab_size, embedding_size + ) # Embedding layer + self.gru = torch.nn.GRU( + embedding_size, hidden_size, layers, batch_first=True + ) # GRU layer(s) + self.output_layer = torch.nn.Linear(hidden_size, vocab_size) + + def forward(self, inputs, hidden): + x = self.word_embeds( + inputs + 
) # transform the input integer into a high dimensional embedding + output, hidden = self.gru(x, hidden) # Compute the output of the GRU layer(s) + output = self.output_layer(output) # compute the logits + return output, hidden + + def init_hidden(self): + return torch.zeros(self.layers, self.batch, self.hidden_size) + + +def train_epoch(model, loader, optim, loss, device): + current_loss = [] # record running loss + model.to(device) # put the model on the specified device + hidden = model.init_hidden().to(device) # create the hidden state + model.train() # tell the model it's training time + for X, y in loader: + X, y = X.to(device), y.to( + device + ) # collect the data and labels from the dataloader and put them on the device + optim.zero_grad() # empty the gradients + output, hidden = model(X, hidden) # compute the output + hidden = hidden.detach() # take the hidden state out of the graph + batch_loss = loss(output.transpose(1, 2), y) # compute loss + batch_loss.backward() # compute gradients + optim.step() # update weights + current_loss.append(batch_loss.item()) # record loss + epoch_loss = np.mean(current_loss) + return epoch_loss + + +def train(epochs: int): + _, _, alphabet_size, _ = preprocess( + get_shakespeare() + ) # get amount of characters for one-hot to embedding + gru = GRU(alphabet_size, 1024, 256, layers=2) # instantiate model + loss = torch.nn.CrossEntropyLoss() + optim = torch.optim.Adam(gru.parameters()) + device = "cuda" if torch.cuda.is_available() else "cpu" + loader = DataLoader( + ShakespeareDataset(seq_len=100), + batch_size=32, + shuffle=True, + num_workers=2, + drop_last=True, + ) + for e in tqdm(range(epochs)): + epoch_loss = train_epoch(gru, loader, optim, loss, device) + print(f"Epoch: {e}, Loss: {epoch_loss}") + torch.save(gru.state_dict(), "saved_models/gru_{}epochs.pth".format(epochs)) # name the checkpoint after the number of epochs actually run + + +if __name__ == "__main__": + train(20) diff --git a/saved_models/rnn_2epochs.pth b/src/dotd/saved_models/rnn_2epochs.pth similarity index 100% rename from 
saved_models/rnn_2epochs.pth rename to src/dotd/saved_models/rnn_2epochs.pth