parent c5cffd1069
commit 1c817c315f
@@ -1,5 +1,8 @@
 dramaoftheday.md
 .idea/
 .venv/
-saved_models/*
-!saved_models/rnn_2epochs.pth
+src/DOTD_generator/saved_models/*
+!src/DOTD_generator/saved_models/rnn_2epochs.pth
+*py[cod]
+.ipynb_checkpoints
+*egg-info
@@ -1 +1 @@
-3.7.3
+3.7.7
@@ -0,0 +1,32 @@
+setup:
+	python -m venv .venv && . .venv/bin/activate
+	pip install --upgrade pip
+	pip install -r requirements.txt
+
+clean-pyc:
+	find . -name '*.pyc' -exec rm -f {} +
+	find . -name '*.pyo' -exec rm -f {} +
+	find . -name '*~' -exec rm -f {} +
+	find . -name '__pycache__' -exec rm -fr {} +
+
+clean-test:
+	rm -f .coverage
+	rm -f .coverage.*
+
+clean: clean-pyc clean-test
+
+test: clean
+	. .venv/bin/activate && py.test tests --cov=src --cov-report=term-missing --cov-fail-under 95
+
+mypy:
+	. .venv/bin/activate && mypy src
+
+lint:
+	. .venv/bin/activate && pylint src -j 4 --reports=y
+
+docs: FORCE
+	cd docs; . .venv/bin/activate && sphinx-apidoc -o ./source ./src
+	cd docs; . .venv/bin/activate && sphinx-build -b html ./source ./build
+FORCE:
+
+check: test lint mypy
@@ -1,19 +0,0 @@
-+++
-title = "Drama of the Day"
-date = "2021-11-07"
-+++
-
-
-This excerpt of a drama, although inspired by Shakespeare, is entirely artifically generated.
-The language model that was used to produce this text only had prior knowledge about the latin alphabet. All the (pseudo-)words produced were learnt through imitating Shakespeare. Therefore, you might find quite a few neologisms, in true Shakespearean spirit.
-
-
-Everyday, the model is told to use the prompt **TOM:** and then produces the rest up to 10000 characters, thereby creating a unique new drama.
-
-
-If you like to know more about this, [click here](/DOTD_generator).
-
-
----
-
-
@@ -0,0 +1,34 @@
+astroid==2.9.3
+attrs==21.4.0
+black==22.1.0
+certifi==2021.10.8
+charset-normalizer==2.0.11
+click==8.0.3
+codecov==2.1.12
+coverage==6.3.1
+idna==3.3
+importlib-metadata==4.10.1
+iniconfig==1.1.1
+isort==5.10.1
+lazy-object-proxy==1.7.1
+mccabe==0.6.1
+mypy==0.931
+mypy-extensions==0.4.3
+packaging==21.3
+pathspec==0.9.0
+platformdirs==2.4.1
+pluggy==1.0.0
+py==1.11.0
+pylint==2.12.2
+pyparsing==3.0.7
+pytest==7.0.0
+pytest-cov==3.0.0
+requests==2.27.1
+toml==0.10.2
+tomli==2.0.0
+torch==1.10.2
+typed-ast==1.5.2
+typing-extensions==4.0.1
+urllib3==1.26.8
+wrapt==1.13.3
+zipp==3.7.0
@@ -1,2 +0,0 @@
-__version__ = "0.1.0"
-
@@ -1,66 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-import torch
-import re
-
-class RNN(torch.nn.Module):
-    def __init__(self, vocab_size, hidden_size, embedding_size, batch=32, layers=2):
-        super(RNN, self).__init__()
-        self.hidden_size = hidden_size  # size of the GRU layers
-        self.batch = batch
-        self.layers = layers  # how many GRU layers
-        self.word_embeds = torch.nn.Embedding(vocab_size, embedding_size)  # Embedding layer
-        self.gru = torch.nn.GRU(embedding_size, hidden_size, layers, batch_first=True)  # GRU layer(s)
-        self.output_layer = torch.nn.Linear(hidden_size, vocab_size)
-
-    def forward(self, inputs, hidden):
-        x = self.word_embeds(inputs)  # transform the input integer into a high dimensional embedding
-        output, hidden = self.gru(x, hidden)  # Compute the output of the GRU layer(s)
-        output = self.output_layer(output)  # compute the logits
-        return output, hidden
-
-    def initHidden(self):
-        return torch.zeros(self.layers, self.batch, self.hidden_size)
-
-def preprocess(text):
-    alphabet = sorted(set(text))
-    letter_to_int = {let: ind for ind, let in enumerate(alphabet)}
-    int_to_letter = {ind: let for ind, let in enumerate(alphabet)}
-    letter_ints = [letter_to_int[letter] for letter in text]
-    alphabet_size = len(alphabet)
-    return int_to_letter, letter_to_int, alphabet_size, letter_ints
-
-
-def markdown_header():
-    return open("/home/tux/shakespeare_generator/markdown_header.txt", "rt").read()
-
-text = open("/home/tux/shakespeare_generator/shakespeare.txt", "rt").read()
-init_seq = "TOM:"
-int_to_letter, letter_to_int, alphabet_size, letter_ints = preprocess(text)
-rnn = RNN(alphabet_size, 1024, 256, layers=2, batch=1)  # instantiate model
-rnn.load_state_dict(torch.load("/home/tux/shakespeare_generator/rnn_2epochs.pth", map_location=torch.device('cpu')))  # load weights
-rnn.eval()  # tell model its time to evaluate
-
-def write_drama(seq, temp=0.7, max_seq_len=1000):
-    hidden = rnn.initHidden()
-    input_idx = torch.LongTensor([[letter_to_int[s] for s in seq]])  # input characters to ints
-    for i in range(max_seq_len):
-        output, hidden = rnn(input_idx, hidden)  # predict the logits for the next character
-        pred = torch.squeeze(output, 0)[-1]
-        pred = pred / temp  # apply temperature
-        pred_id = torch.distributions.categorical.Categorical(logits=pred).sample()  # sample from the distribution
-        input_idx = torch.cat((input_idx[:,1:], pred_id.reshape(1,-1)), 1)  # predicted character is added to our input
-        seq += int_to_letter[pred_id.item()]  # add predicted character to sequence
-    return seq
-
-def stylise_drama(drama):
-    return re.sub(r"\n", r"<br>\n", drama)
-
-output_text = markdown_header() + stylise_drama(write_drama(init_seq))
-
-with open("/home/tux/shakespeare_generator/dramaoftheday.md", "w") as f:
-    f.write(output_text)
-
-
-
-
@@ -0,0 +1,7 @@
+# coding: utf-8
+
+from os.path import dirname, abspath
+
+__version__ = "0.1.0"
+
+ROOT_DIR = dirname(abspath(__file__))
@@ -0,0 +1,36 @@
+# coding: utf-8
+
+import torch
+from torch.utils.data import Dataset
+
+
+def get_shakespeare():
+    return open("data/shakespeare.txt", "rt").read()
+
+
+def preprocess(text: str) -> tuple:
+    alphabet = sorted(set(text))
+    letter_to_int = {let: ind for ind, let in enumerate(alphabet)}
+    int_to_letter = {ind: let for ind, let in enumerate(alphabet)}
+    letter_ints = [letter_to_int[letter] for letter in text]
+    alphabet_size = len(alphabet)
+    return int_to_letter, letter_to_int, alphabet_size, letter_ints
+
+
+class ShakespeareDataset(Dataset):
+    def __init__(self, seq_len: int):
+        _, _, _, self.text = preprocess(get_shakespeare())
+        self.x = torch.LongTensor(self.text[:-1])  # get the data
+        self.y = torch.LongTensor(self.text[1:])
+        self.seq_len = seq_len  # set the sequence length
+
+    def __len__(self) -> int:
+        return (
+            len(self.text) - self.seq_len - 1
+        )  # length of corpora minus sequence length minus shift
+
+    def __getitem__(self, index: int) -> tuple:
+        return (
+            self.x[index : index + self.seq_len],
+            self.y[index : index + self.seq_len],  # labels come from the shifted tensor, not self.x
+        )  # return tuple of (sample, label)
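
Note on the hunk above: ShakespeareDataset pairs each input window with the same window shifted left by one character, so the label at every position is the character that follows the input. A minimal standalone sketch of that shift, using a toy string and illustrative names rather than anything from this commit:

    import torch

    text = "To be, or not to be"
    alphabet = sorted(set(text))
    letter_to_int = {let: ind for ind, let in enumerate(alphabet)}
    int_to_letter = {ind: let for ind, let in enumerate(alphabet)}
    ints = [letter_to_int[c] for c in text]

    x = torch.LongTensor(ints[:-1])  # inputs: every character except the last
    y = torch.LongTensor(ints[1:])   # targets: the same sequence shifted left by one

    seq_len, index = 5, 0
    sample, label = x[index : index + seq_len], y[index : index + seq_len]
    print("".join(int_to_letter[i.item()] for i in sample))  # -> "To be"
    print("".join(int_to_letter[i.item()] for i in label))   # -> "o be,"

Each target character is the one that follows its input character, which is exactly the next-character prediction task the training loop's cross-entropy loss assumes.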
@@ -0,0 +1,20 @@
++++
+title = "Drama of the Day"
+date = "2021-11-07"
++++
+
+
+This excerpt of a drama, although inspired by Shakespeare, is entirely artificially generated.
+The language model that was used to produce this text only had prior knowledge about the Latin alphabet.
+All the (pseudo-)words produced were learnt through imitating Shakespeare. Therefore, you might find quite a few neologisms, in true Shakespearean spirit.
+
+
+Every day, the model is told to use the prompt **TOM:** and then produces the rest up to 10000 characters, thereby creating a unique new drama.
+
+
+If you'd like to know more about this, [click here](/DOTD_generator).
+
+
+---
+
+
@@ -0,0 +1,59 @@
+# coding: utf-8
+
+import re
+import torch
+from dotd.model import GRU
+from dotd.data import get_shakespeare, preprocess
+
+
+def get_header():
+    return open("data/markdown_header.txt", "rt").read()
+
+
+def load_model(alphabet_size: int, path: str = "saved_models/rnn_2epochs.pth") -> GRU:
+    gru = GRU(alphabet_size, 1024, 256, layers=2, batch=1)  # instantiate model
+    gru.load_state_dict(
+        torch.load(
+            path,
+            map_location=torch.device("cpu"),
+        )
+    )  # load weights
+    return gru.eval()
+
+
+def make_pretty(text: str) -> str:
+    return re.sub(r"\n", r"<br>\n", text)
+
+
+def write_drama(seq, temp=0.7, max_seq_len=1000):
+    int_to_letter, letter_to_int, alphabet_size, _ = preprocess(get_shakespeare())
+    gru = load_model(alphabet_size=alphabet_size)
+    hidden = gru.init_hidden()
+    input_idx = torch.LongTensor(
+        [[letter_to_int[s] for s in seq]]
+    )  # input characters to ints
+    for i in range(max_seq_len):
+        output, hidden = gru(
+            input_idx, hidden
+        )  # predict the logits for the next character
+        pred = torch.squeeze(output, 0)[-1]
+        pred = pred / temp  # apply temperature
+        pred_id = torch.distributions.categorical.Categorical(
+            logits=pred
+        ).sample()  # sample from the distribution
+        input_idx = torch.cat(
+            (input_idx[:, 1:], pred_id.reshape(1, -1)), 1
+        )  # predicted character is added to our input
+        seq += int_to_letter[pred_id.item()]  # add predicted character to sequence
+    return seq
+
+
+def main():
+    init_seq = "TOM:"
+    output_text = get_header() + make_pretty(write_drama(init_seq))
+    with open("dramaoftheday.md", "w") as f:
+        f.write(output_text)
+
+
+if __name__ == "__main__":
+    main()
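
A note on the temp parameter in write_drama above: dividing the logits by a temperature below 1.0 sharpens the softmax distribution that characters are sampled from, while values above 1.0 flatten it. A standalone sketch with made-up logits (not part of this commit):

    import torch

    logits = torch.tensor([2.0, 1.0, 0.1])
    for temp in (0.5, 1.0, 2.0):
        probs = torch.softmax(logits / temp, dim=0)
        print(temp, [round(p, 3) for p in probs.tolist()])
    # temp < 1 concentrates mass on the likeliest character (safer, more repetitive text);
    # temp > 1 spreads it out (more surprising text, more neologisms).

The default of 0.7 therefore biases generation toward the model's most confident continuations while still allowing variety.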
@@ -0,0 +1,78 @@
+# coding: utf-8
+
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import numpy as np
+from dotd.data import ShakespeareDataset, preprocess, get_shakespeare
+
+
+class GRU(torch.nn.Module):
+    def __init__(self, vocab_size, hidden_size, embedding_size, batch=32, layers=2):
+        super(GRU, self).__init__()
+        self.hidden_size = hidden_size  # size of the GRU layers
+        self.batch = batch
+        self.layers = layers  # how many GRU layers
+        self.word_embeds = torch.nn.Embedding(
+            vocab_size, embedding_size
+        )  # Embedding layer
+        self.gru = torch.nn.GRU(
+            embedding_size, hidden_size, layers, batch_first=True
+        )  # GRU layer(s)
+        self.output_layer = torch.nn.Linear(hidden_size, vocab_size)
+
+    def forward(self, inputs, hidden):
+        x = self.word_embeds(
+            inputs
+        )  # transform the input integer into a high dimensional embedding
+        output, hidden = self.gru(x, hidden)  # Compute the output of the GRU layer(s)
+        output = self.output_layer(output)  # compute the logits
+        return output, hidden
+
+    def init_hidden(self):
+        return torch.zeros(self.layers, self.batch, self.hidden_size)
+
+
+def train_epoch(model, loader, optim, loss, device):
+    current_loss = []  # record running loss
+    model.to(device)  # put the model on the specified device
+    hidden = model.init_hidden().to(device)  # create the hidden state
+    model.train()  # tell the model it's training time
+    for X, y in loader:
+        X, y = X.to(device), y.to(
+            device
+        )  # collect the data and labels from the dataloader and put them on the device
+        optim.zero_grad()  # empty the gradients
+        output, hidden = model(X, hidden)  # compute the output
+        hidden = hidden.detach()  # take the hidden state out of the graph
+        batch_loss = loss(output.transpose(1, 2), y)  # compute loss
+        batch_loss.backward()  # compute gradients
+        optim.step()  # update weights
+        current_loss.append(batch_loss.item())  # record loss
+    epoch_loss = np.mean(current_loss)
+    return epoch_loss
+
+
+def train(epochs: int):
+    _, _, alphabet_size, _ = preprocess(
+        get_shakespeare()
+    )  # get amount of characters for one-hot to embedding
+    gru = GRU(alphabet_size, 1024, 256, layers=2)  # instantiate model
+    loss = torch.nn.CrossEntropyLoss()
+    optim = torch.optim.Adam(gru.parameters())
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    loader = DataLoader(
+        ShakespeareDataset(seq_len=100),
+        batch_size=32,
+        shuffle=True,
+        num_workers=2,
+        drop_last=True,
+    )
+    for e in tqdm(range(epochs)):
+        l = train_epoch(gru, loader, optim, loss, device)
+        print(f"Epoch: {e}, Loss: {l}")
+    torch.save(gru.state_dict(), "saved_models/gru_{}epochs.pth".format(epochs + 1))
+
+
+if __name__ == "__main__":
+    train(20)
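
One detail in train_epoch worth spelling out: torch.nn.CrossEntropyLoss expects class scores in dimension 1, while the model emits (batch, seq_len, vocab_size); that is what the output.transpose(1, 2) call is for. A shape-only sketch with made-up sizes (not part of this commit):

    import torch

    batch, seq_len, vocab = 32, 100, 65           # illustrative sizes only
    output = torch.randn(batch, seq_len, vocab)   # what the linear head returns
    targets = torch.randint(0, vocab, (batch, seq_len))
    loss = torch.nn.CrossEntropyLoss()
    print(loss(output.transpose(1, 2), targets).item())  # scores as (batch, vocab, seq)

The hidden.detach() call just above the loss serves a related purpose: it truncates backpropagation at the batch boundary so gradients do not flow through the entire history of hidden states.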