First, I'm sorry for my poor English.
Thanks to Yoon Kim and dennybritz.
I edited your scripts to add multi-label support (one label per example, drawn from any label set).
These scripts support multi-label data in the following format (tab-separated text and label):
text1....\t label1
text2....\t label2
text3....\t label1
text4....\t label3
Between training and evaluation there is
no vocabulary difference problem,
no label set difference problem,
no max sentence length difference problem,
and no unseen word problem (see the sketch below).
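The reason is that train.py saves the max sentence length, vocabulary and label set next to the checkpoints, eval.py reloads them through data_helpers.load_data, and any word that was not seen at training time is mapped to the <UNK/> mark. A minimal sketch of that lookup, with a made-up example vocabulary:

# Hypothetical vocabulary, shaped like the one build_input_data receives at eval time
vocabulary = {"<PAD/>": 0, "<UNK/>": 1, "hello": 2, "world": 3}
UNK_MARK = "<UNK/>"
sentence = ["hello", "never_seen_word"]
ids = [vocabulary[w] if w in vocabulary else vocabulary[UNK_MARK] for w in sentence]
print(ids)  # [2, 1] -- the unseen word falls back to the <UNK/> index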
Just use:
python train.py --train_data_path="./data/train.txt"
python eval.py --checkpoint_dir="./runs/1463968251/checkpoints/" --test_data_path="./data/test.txt"
Here are my scripts:
data_helpers.py
import codecs
import os.path
import numpy as np
import re
import itertools
from collections import Counter
PAD_MARK = "<PAD/>"
UNK_MARK = "<UNK/>"
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)  # blocked to allow non-English character sets
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def load_data_and_labels(train_data_path):
    """
    Loads tab-separated "text<TAB>label" data, splits the text into words
    and collects the labels.
    Returns split sentences and labels.
    """
    # Load data from file
    data = list()
    labels = list()
    for line in codecs.open(train_data_path, 'r', encoding='utf8').readlines():
        if not line.strip():
            continue
        t = line.split(u"\t")
        if len(t) != 2:
            print("data format error: " + line)
            continue
        data.append(t[0])
        labels.append(t[1])
    data = [s.strip() for s in data]
    labels = [s.strip() for s in labels]
    # Split by words
    x_text = [clean_str(sent) for sent in data]
    x_text = [s.split(u" ") for s in x_text]
    return [x_text, labels]
def pad_sentences(sentences, max_sent_len_path):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence,
    or by a previously saved maximum length when max_sent_len_path is given.
    Returns the padded sentences and the maximum sequence length.
    """
    max_sequence_length = 0
    # Load the saved maximum sentence length if available
    if len(max_sent_len_path) > 0:
        max_sequence_length = int(open(max_sent_len_path, 'r').readlines()[0])
    else:
        max_sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        if max_sequence_length <= len(sentence):
            # Truncate sentences longer than the maximum length
            padded_sentences.append(sentence[:max_sequence_length])
            continue
        num_padding = max_sequence_length - len(sentence)
        new_sentence = sentence + [PAD_MARK] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences, max_sequence_length
def build_vocab(sentences, base_vocab_path):
    """
    Builds a vocabulary mapping from word to index based on the sentences,
    or on a previously saved vocabulary when base_vocab_path is given.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    vocabulary_inv = []
    # Load base vocabulary
    if len(base_vocab_path) > 0:
        vL = [[w.strip()] for w in codecs.open(base_vocab_path, 'r', encoding='utf8').readlines()]
        c = Counter(itertools.chain(*vL))
        vocabulary_inv = [x[0] for x in c.most_common()]
    else:
        # Build vocabulary from the training data
        word_counts = Counter(itertools.chain(*sentences))
        # Mapping from index to word
        vocabulary_inv = vocabulary_inv + [x[0] for x in word_counts.most_common()]
    if UNK_MARK not in vocabulary_inv:
        vocabulary_inv.append(UNK_MARK)
    vocabulary_inv = list(set(vocabulary_inv))
    vocabulary_inv.sort()
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    if UNK_MARK not in vocabulary:
        vocabulary[UNK_MARK] = vocabulary[PAD_MARK]
    return [vocabulary, vocabulary_inv]
def make_onehot(idx, size):
    """Builds a one-hot list of the given size with a 1 at position idx."""
    onehot = []
    for i in range(size):
        if idx == i:
            onehot.append(1)
        else:
            onehot.append(0)
    return onehot
# end def
def make_label_dic(labels):
    """
    creator: [email protected]
    create date: 2016.05.22
    make 'label : one hot' dic
    """
    label_onehot = dict()
    onehot_label = dict()
    for i, label in enumerate(labels):
        onehot = make_onehot(i, len(labels))
        label_onehot[label] = onehot
        onehot_label[str(onehot)] = label
    return label_onehot, onehot_label
# end def
def build_onehot(labels, base_label_path):
    """
    Builds a mapping from label to one-hot vector based on the labels,
    or on a previously saved label set when base_label_path is given.
    Returns the unique labels and both mappings.
    """
    uniq_labels = []
    # Load base label set
    if len(base_label_path) > 0:
        vL = [[w.strip()] for w in codecs.open(base_label_path, 'r', encoding='utf8').readlines()]
        c = Counter(itertools.chain(*vL))
        uniq_labels = [x[0] for x in c.most_common()]
    else:
        # Build label set from the training data
        label_counts = Counter(labels)
        # Mapping from index to label
        uniq_labels = uniq_labels + [x[0] for x in label_counts.most_common()]
    uniq_labels = list(set(uniq_labels))
    uniq_labels.sort()
    label_onehot, onehot_label = make_label_dic(uniq_labels)
    return [uniq_labels, label_onehot, onehot_label]
def build_input_data(sentences, vocabulary, labels, label_onehot):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    Words that are not in the vocabulary are mapped to UNK_MARK.
    """
    vL = []
    for sentence in sentences:
        wL = []
        for word in sentence:
            if word in vocabulary:
                wL.append(vocabulary[word])
            else:
                wL.append(vocabulary[UNK_MARK])
        vL.append(wL)
    x = np.array(vL)
    y = np.array([label_onehot[label] for label in labels])
    return [x, y]
def load_data(train_data_path, checkpoint_dir=""):
    """
    Loads and preprocesses the tab-separated data.
    When checkpoint_dir is given, the max sentence length, vocabulary and label set
    saved there are reused so that training and evaluation stay consistent.
    Returns input vectors, labels, vocabulary, inverse vocabulary,
    the one-hot-to-label mapping and the maximum sequence length.
    """
    # Paths of the artifacts saved next to the checkpoints (empty means "build from data")
    max_sent_len_path = "" if len(checkpoint_dir) < 1 else checkpoint_dir + "/max_sent_len"
    vocab_path = "" if len(checkpoint_dir) < 1 else checkpoint_dir + "/vocab"
    label_path = "" if len(checkpoint_dir) < 1 else checkpoint_dir + "/label"
    sentences, labels = load_data_and_labels(train_data_path)
    sentences_padded, max_sequence_length = pad_sentences(sentences, max_sent_len_path)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded, vocab_path)
    uniq_labels, label_onehot, onehot_label = build_onehot(labels, label_path)
    x, y = build_input_data(sentences_padded, vocabulary, labels, label_onehot)
    return [x, y, vocabulary, vocabulary_inv, onehot_label, max_sequence_length]
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # Avoid yielding an empty final batch when data_size is an exact multiple of batch_size
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
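For reference, a minimal sketch of how train.py uses these helpers (assuming the default ./data/train.txt path):

import data_helpers

# Build padded inputs, one-hot labels, vocabulary and label mapping from the training file
x, y, vocabulary, vocabulary_inv, onehot_label, max_len = data_helpers.load_data("./data/train.txt")

# Iterate over shuffled (x, y) batches for one epoch
for batch in data_helpers.batch_iter(list(zip(x, y)), batch_size=64, num_epochs=1):
    x_batch, y_batch = zip(*batch)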
train.py
#! /usr/bin/env python
import codecs
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
# Parameters
# ==================================================
# Model Hyperparameters
tf.flags.DEFINE_string("train_data_path", "./data/train.txt", "Data path to training")
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
# Data Preparation
# ==================================================
# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv, onehot_label, max_sequence_length = data_helpers.load_data( FLAGS.train_data_path )
# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
print("Labels: %d: %s" % ( len(onehot_label), ','.join( onehot_label.values() ) ) )
print("Vocabulary Size: {:d}".format(len(vocabulary)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
# Training
# ==================================================
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=len(onehot_label),
            vocab_size=len(vocabulary),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)
        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)
        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))
        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)
        # Train Summaries
        train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def)
        # Dev summaries
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def)
        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # Save additional model info so eval.py can rebuild the same mappings
        codecs.open(os.path.join(checkpoint_dir, "max_sent_len"), "w", encoding='utf8').write(str(max_sequence_length))
        codecs.open(os.path.join(checkpoint_dir, "vocab"), "w", encoding='utf8').write('\n'.join(vocabulary_inv))
        codecs.open(os.path.join(checkpoint_dir, "label"), "w", encoding='utf8').write('\n'.join(onehot_label.values()))
        saver = tf.train.Saver(tf.all_variables())
        # Initialize all variables
        sess.run(tf.initialize_all_variables())
        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)
        # Generate batches
        batches = data_helpers.batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))
eval.py
#! /usr/bin/env python
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
# Parameters
# ==================================================
# Eval Parameters
tf.flags.DEFINE_string("test_data_path", "./data/test.txt", "Data path to evaluation")
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
# Load data. Load your own data here
print("Loading data...")
x_test, y_test, vocabulary, vocabulary_inv, onehot_label, max_sequence_length = data_helpers.load_data( FLAGS.test_data_path, FLAGS.checkpoint_dir )
y_test = np.argmax(y_test, axis=1)
print("Labels: %d: %s" % ( len(onehot_label), ','.join( sorted(onehot_label.values()) ) ) )
print("Vocabulary size: {:d}".format(len(vocabulary)))
print("Test set size {:d}".format(len(y_test)))
print("\nEvaluating...\n")
# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        print("FLAGS.checkpoint_dir %s" % FLAGS.checkpoint_dir)
        print("checkpoint_file %s" % checkpoint_file)
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)
        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        # Generate batches for one epoch
        batches = data_helpers.batch_iter(x_test, FLAGS.batch_size, 1, shuffle=False)
        # Collect the predictions here
        all_predictions = []
        for x_test_batch in batches:
            batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
            all_predictions = np.concatenate([all_predictions, batch_predictions])
# Print accuracy
print "y_test: " + str(y_test)
print "all_predictions: " + str(all_predictions)
correct_predictions = float(sum(all_predictions == y_test))
print("Total number of test examples: {}".format(len(y_test)))
print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))