(AI) Building a text dictionary

Post by **Antonio Linares** » Mon Aug 28, 2017 10:34 am

Generic Python code to build a dictionary from a given text file:

text.py

# Word-frequency table: maps each word to the number of times it occurs.
dictionary = {}   # hash table

# Read the file inside a context manager so the handle is closed as soon as
# the content is in memory; every "."-terminated chunk is one sentence.
with open( 'original_english.txt' ) as source :
    text = source.read().split( "." )

for sentence in text :
    # Line breaks become word separators; commas are dropped.
    sentence = sentence.replace( "\n", " " ).replace( ",", "" )

    # split() without an argument collapses runs of whitespace, so consecutive
    # blanks no longer produce empty-string "words" in the table.
    for word in sentence.split() :
        print( word )
        dictionary[ word ] = dictionary.get( word, 0 ) + 1

print( dictionary )

Post by **Antonio Linares** » Mon Aug 28, 2017 10:43 am

Enhanced version:

text.py

Code: Select all

# Word-frequency table: maps each word to the number of times it occurs.
dictionary = {}   # hash table

# Read the file inside a context manager so the handle is closed as soon as
# the content is in memory; every "."-terminated chunk is one sentence.
with open( 'original_english.txt' ) as source :
    text = source.read().split( "." )

for sentence in text :
    # Line breaks become word separators; commas, form feeds, exclamation
    # marks and double quotes are removed before the sentence is split.
    sentence = sentence.replace( "\n", " " ).replace( ",", "" ).replace( "\x0c", "" ).replace( "!", "" ).replace( '"', "" )

    # split() without an argument collapses runs of whitespace, so consecutive
    # blanks no longer produce empty-string "words" in the table.
    for word in sentence.split() :
        print( word )
        dictionary[ word ] = dictionary.get( word, 0 ) + 1

# Only the distinct words are shown, in sorted order (the counts are not printed).
print( sorted( dictionary ) )

Post by **Antonio Linares** » Mon Aug 28, 2017 10:59 am

https://github.com/zackthoutt/got-book- ... ator.ipynb

Thanks to Felix

Post by **Antonio Linares** » Mon Aug 28, 2017 1:20 pm

Understanding the creation of the dictionaries:

hash.py

Code: Select all

words = [ "hello", "world" ]

# Forward table: position in the list -> word.
int_to_word = dict( enumerate( words ) )

# Reverse table: word -> position, obtained by flipping the forward table.
word_to_int = { word: index for index, word in int_to_word.items() }

print( int_to_word )
print( word_to_int )

python hash.py
{0: 'hello', 1: 'world'}
{'hello': 0, 'world': 1}

How can this be coded in Harbour?

Post by **Antonio Linares** » Mon Aug 28, 2017 7:51 pm

Long Short Term Memory networks – usually just called “LSTMs” – are a special kind of RNN, capable of learning long-term dependencies. They were introduced by Hochreiter & Schmidhuber (1997).

Long short-term memory (LSTM) is a recurrent neural network (RNN) architecture that remembers values over arbitrary intervals.

https://en.wikipedia.org/wiki/Long_short-term_memory

https://medium.com/towards-data-science ... b0c1968537

Post by **Antonio Linares** » Wed Aug 30, 2017 8:18 am

book_train.py

Code: Select all

import numpy as np
import tensorflow as tf
import glob

# Every *.txt file in the current working directory is treated as one book;
# sorting makes the concatenation order stable.
book_filenames = sorted( glob.glob( "*.txt" ) )

print( "Found {} books".format( len( book_filenames ) ) )

import codecs

# The whole corpus as a single string: all books appended back to back.
corpus_raw = u""

for filename in book_filenames :
    # NOTE: the books are decoded with the platform default encoding. If they
    # are stored as UTF-8, read them with codecs.open( filename, 'r', 'utf-8' )
    # instead.
    # The context manager closes each book as soon as it has been read.
    with open( filename ) as book_file :
        corpus_raw += book_file.read()

print( "Corpus is {} characters long".format( len( corpus_raw ) ) )

def create_lookup_tables(text):
    """
    Create lookup tables for vocab

    Ids are assigned to the distinct words in sorted order, so the same corpus
    always produces the same tables regardless of set/hash iteration order.

    :param text: The corpus split into words (hashable, mutually orderable items)
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # A bare set iterates strings in an order that can change between
    # interpreter runs; sorting pins the id of every word.
    vocab = sorted(set(text))
    int_to_vocab = dict(enumerate(vocab))
    # Built from int_to_vocab so both tables are guaranteed to be inverses.
    vocab_to_int = {word: key for key, word in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab
    
def token_lookup():
    """
    Build the table that maps punctuation to a placeholder token
    :return: dictionary mapping each punctuation string to its token string
    """
    # Kept as ordered pairs: callers iterate the resulting dict in this order.
    punctuation_tokens = [
        ('.', '||period||'),
        (',', '||comma||'),
        ('"', '||quotes||'),
        (';', '||semicolon||'),
        ('!', '||exclamation-mark||'),
        ('?', '||question-mark||'),
        ('(', '||left-parentheses||'),
        (')', '||right-parentheses||'),
        ('--', '||emm-dash||'),
        ('\n', '||return||'),
    ]
    return dict(punctuation_tokens)
    
import pickle

# Surround every punctuation mark with spaces and replace it by its token so
# that split() below yields punctuation as separate "words".
token_dict = token_lookup()
for token, replacement in token_dict.items():
    corpus_raw = corpus_raw.replace(token, ' {} '.format(replacement))
corpus_raw = corpus_raw.lower()
corpus_raw = corpus_raw.split()

# Encode the corpus as a list of word ids.
vocab_to_int, int_to_vocab = create_lookup_tables(corpus_raw)
corpus_int = [vocab_to_int[word] for word in corpus_raw]

# Persist the encoded corpus and the tables; the context manager guarantees
# the pickle is flushed and the handle closed before training starts.
with open('preprocess.p', 'wb') as preprocess_file:
    pickle.dump((corpus_int, vocab_to_int, int_to_vocab, token_dict), preprocess_file)
    
def get_batches(int_text, batch_size, seq_length):
    """
    Cut the id sequence into (input, target) batch pairs
    :param int_text: text with words replaced by their ids (a list)
    :param batch_size: number of sequences in each batch
    :param seq_length: number of ids in each sequence
    :return: numpy array of shape (num_batches, 2, batch_size, seq_length);
             index 0 on the second axis is the input, index 1 the target
    """
    batch_span = batch_size * seq_length
    batch_count = len(int_text) // batch_span

    # Drop the tail that does not fill a complete batch.
    usable = int_text[:batch_count * batch_span]

    inputs = np.array(usable)
    # Targets are the inputs shifted one position to the left; the very first
    # id wraps around to become the last target.
    shifted = np.array(usable[1:] + [usable[0]])

    # One row per sequence slot, then slice the rows column-wise into batches.
    input_chunks = np.split(inputs.reshape(batch_size, -1), batch_count, axis=1)
    target_chunks = np.split(shifted.reshape(batch_size, -1), batch_count, axis=1)

    paired = [(x_part, y_part) for x_part, y_part in zip(input_chunks, target_chunks)]
    return np.array(paired)
    
# --- Training schedule -------------------------------------------------
num_epochs = 10000        # full passes over the batches
learning_rate = 0.001     # fed to the 'learning_rate' placeholder on every step

# --- Batch geometry ----------------------------------------------------
batch_size = 512          # sequences per batch
seq_length = 30           # word ids per sequence

# --- Network -----------------------------------------------------------
embed_dim = 512           # width of the word embedding
rnn_size = 512            # num_units of each LSTM cell
num_layers = 3            # stacked LSTM layers
keep_prob = 0.7           # output keep probability of the dropout wrapper

# --- Checkpointing -----------------------------------------------------
save_dir = './'           # passed unchanged to Saver.save() as the checkpoint path

# Build the training graph: embedding -> stacked LSTM -> per-word logits,
# trained with a sequence loss and clipped Adam updates.
train_graph = tf.Graph()
with train_graph.as_default():

    # Initialize input placeholders
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')

    # Calculate text attributes
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)

    # Build the RNN cell.
    # Every layer gets its own cell object: per the TensorFlow 1.x release
    # notes, repeating one object ([cell] * num_layers) makes all layers share
    # a single set of parameters instead of stacking independent layers.
    layer_cells = []
    for _ in range(num_layers):
        lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
        drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        layer_cells.append(drop_cell)
    cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    # Set the initial state; tf.identity gives it the fixed name
    # 'initial_state' so it can be fetched by name from a restored graph.
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')

    # Create word embedding as input to RNN
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)

    # Build RNN
    # NOTE(review): dynamic_rnn is called without initial_state=, so values fed
    # for 'initial_state' do not appear to reach the recurrence - confirm
    # whether the state is meant to carry over between batches.
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')

    # Take RNN output and make logits (no activation: softmax is applied below)
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)

    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')

    # Define loss function; the all-ones weights count every position equally
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )

    # Learning rate optimizer.
    # Driven by the 'learning_rate' placeholder so the value fed on each
    # training step is the one actually used.
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    
import time

# Store the values the generation script needs; the context manager makes
# sure the pickle is flushed and closed before the long training run starts.
with open('params.p', 'wb') as params_file:
    pickle.dump((seq_length, save_dir), params_file)
batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    # One Saver for the whole run: constructing a new one at every checkpoint
    # would keep adding save/restore ops to the graph.
    saver = tf.train.Saver()

    for epoch in range(num_epochs):
        # Fetch a start state sized for the batch dimension of the first input
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)

        # Progress is reported once per epoch, using the loss of its last batch;
        # time_remaining extrapolates the average time per batch so far.
        time_elapsed = time.time() - start_time
        print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}   time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
            epoch + 1,
            batch_index + 1,
            len(batches),
            train_loss,
            time_elapsed,
            ((num_batches * num_epochs)/((epoch + 1) * (batch_index + 1))) * time_elapsed - time_elapsed))

        # save model every 10 epochs
        if epoch % 10 == 0:
            saver.save(sess, save_dir)
            print('Model Trained and Saved')

book_gen.py

Code: Select all

import tensorflow as tf
import numpy as np
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files that were
# written by book_train.py on a trusted machine.
# Context managers close both files as soon as their content is loaded.
with open('preprocess.p', mode='rb') as preprocess_file:
    corpus_int, vocab_to_int, int_to_vocab, token_dict = pickle.load(preprocess_file)
with open('params.p', mode='rb') as params_file:
    seq_length, save_dir = pickle.load(params_file)

def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word with some randomness
    :param probabilities: Probabilities of the next word; position i is the
                          probability of the word whose id is i
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    # Sample an id and look the word up by key. This does not depend on the
    # iteration order of the dict, avoids building an array of every word on
    # each call, and returns a plain str instead of a numpy string scalar.
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_vocab[int(word_id)]

# Generate gen_length words with the trained model, starting from prime_words,
# and print the result with the punctuation tokens turned back into punctuation.
gen_length = 1000     # number of words to generate
prime_words = 'time'  # seed text; each word is looked up in vocab_to_int below

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    # (graph structure from the '.meta' file, then the variable values)
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)
    
    # Get tensors from loaded graph
    # (looked up by the names they were given when the graph was built)
    input_text = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')
    
    # Sentences generation setup
    gen_sentences = prime_words.split()
    # The fed ids are all 1: only the shape of this input (one row, one column
    # per seed word) is meaningful here, not the ids themselves.
    prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})
    
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        # A single-row batch holding the ids of the last seq_length words so far
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # Row 0 is the only batch entry; the last position holds the
        # distribution for the word that follows the current input.
        pred_word = pick_word( probabilities[ 0, dyn_seq_length - 1 ], int_to_vocab )

        gen_sentences.append( pred_word )
        
    # Remove tokens
    # (each ' <token>' is replaced by the punctuation it stands for)
    chapter_text = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        chapter_text = chapter_text.replace(' ' + token.lower(), key)
        
    print( chapter_text )

# Rebuild the chapter from the generated words and turn every punctuation
# token (together with the space in front of it) back into its punctuation.
chapter_text = ' '.join(gen_sentences)
for key, token in token_dict.items():
    chapter_text = chapter_text.replace(' ' + token.lower(), key)
# Tidy the spaces left behind after line breaks and opening parentheses.
chapter_text = chapter_text.replace('\n ', '\n')
chapter_text = chapter_text.replace('( ', '(')

# Words that must start with a capital letter (the corpus was lower-cased).
capitalize_words = ['juan']

import re
for word in capitalize_words:
    proper_form = word.lower().title()
    # Match whole words only, so a listed word is not capitalized when it
    # merely occurs inside a longer word. A function replacement keeps the
    # substituted text literal.
    chapter_text = re.sub(
        r'\b' + re.escape(word) + r'\b',
        lambda match, proper_form=proper_form: proper_form,
        chapter_text)

import os
version_dir = './generated-book-v1'
if not os.path.exists(version_dir):
    os.makedirs(version_dir)

# Number the new chapter after the files already present in the directory...
num_chapters = len([name for name in os.listdir(version_dir) if os.path.isfile(os.path.join(version_dir, name))])
next_chapter = version_dir + '/chapter-' + str(num_chapters + 1) + '.md'
# ...and skip ahead if that name is taken (e.g. after an earlier chapter was
# deleted), so an existing chapter is never overwritten.
while os.path.exists(next_chapter):
    num_chapters += 1
    next_chapter = version_dir + '/chapter-' + str(num_chapters + 1) + '.md'
with open(next_chapter, "w") as text_file:
    text_file.write(chapter_text)

FiveTech Software tech support forums

(AI) Building a text dictionary

(AI) Building a text dictionary

Re: (AI) Building a text dictionary

Re: (AI) Building a text dictionary

Re: (AI) Building a text dictionary

Re: (AI) Building a text dictionary

Re: (AI) Building a text dictionary