
Comparing changes

base repository: IBM/pytorch-seq2seq
base: master
head repository: IBM/pytorch-seq2seq
compare: ptrnet
Can’t automatically merge.
  • 3 commits
  • 6 files changed
  • 2 contributors

Commits on Sep 7, 2017

  1. c9b14a7

Commits on Sep 13, 2017

  1. 1727a2f

Commits on Sep 21, 2017

  1. temporary commit.

    kylegao91 committed Sep 21, 2017
    d80e0ce
Showing with 90 additions and 16 deletions.
  1. +8 −4 examples/sample.py
  2. +2 −1 scripts/generate_toy_data.py
  3. +1 −0 seq2seq/loss/loss.py
  4. +20 −11 seq2seq/models/DecoderRNN.py
  5. +35 −0 seq2seq/models/attention.py
  6. +24 −0 tests/test_ptr_attention.py
12 changes: 8 additions & 4 deletions examples/sample.py
@@ -86,9 +86,13 @@ def len_filter(example):
# seq2seq.tgt_field_name = 'tgt'

# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
# weight = torch.ones(len(tgt.vocab))
# pad = tgt.vocab.stoi[tgt.pad_token]
# loss = Perplexity(weight, pad)
# if torch.cuda.is_available():
# loss.cuda()

loss = Perplexity(None, None)
if torch.cuda.is_available():
loss.cuda()

@@ -100,7 +104,7 @@ def len_filter(example):
encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
variable_lengths=True)
decoder = DecoderRNN(len(tgt.vocab), max_len, hidden_size,
dropout_p=0.2, use_attention=True,
dropout_p=0.2, attention='pointer',
eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
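Note (not part of the diff): a rough sketch of why the vocabulary weight and pad index are dropped above. With attention='pointer' each decoding step yields a log-distribution over input positions rather than over tgt.vocab, so a weight vector of size len(tgt.vocab) would no longer match the output; Perplexity(None, None) falls back to an unweighted NLLLoss with no pad mask. Shapes and values below are hypothetical; the import follows the module path of seq2seq/loss/loss.py.

    import torch
    import torch.nn.functional as F
    from torch.autograd import Variable
    from seq2seq.loss.loss import Perplexity

    loss = Perplexity(None, None)      # unweighted NLLLoss, no pad mask
    # one decoding step for a batch of 4 over an input sequence of length 10
    step_output = F.log_softmax(Variable(torch.randn(4, 10)))   # (batch, input_len)
    target = Variable(torch.LongTensor([9, 3, 0, 7]))           # pointer targets
    loss.eval_batch(step_output, target)                        # accumulates the NLL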
3 changes: 2 additions & 1 deletion scripts/generate_toy_data.py
@@ -22,7 +22,8 @@ def generate_dataset(root, name, size):
seq = []
for _ in range(length):
seq.append(str(random.randint(0, 9)))
fout.write("\t".join([" ".join(seq), " ".join(reversed(seq))]))
# fout.write("\t".join([" ".join(seq), " ".join(reversed(seq))]))
fout.write("\t".join([" ".join(seq), " ".join([str(t) for t in reversed(range(len(seq)))])]))
fout.write('\n')

# generate vocabulary
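Note (not part of the diff): with this change a toy sample's target is no longer the reversed digit sequence but the zero-based source positions in reverse order, i.e. pointer indices into the input. A standalone sketch of the new line format:

    import random

    # mirrors the changed fout.write(...) line above
    seq = [str(random.randint(0, 9)) for _ in range(5)]
    src = " ".join(seq)                                        # e.g. "3 7 1 0 9"
    tgt = " ".join(str(t) for t in reversed(range(len(seq))))  # always "4 3 2 1 0"
    print(src + "\t" + tgt)                                    # one dataset line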
1 change: 1 addition & 0 deletions seq2seq/loss/loss.py
@@ -135,6 +135,7 @@ def __init__(self, weight=None, mask=None):
super(Perplexity, self).__init__(weight=weight, mask=mask, size_average=False)

def eval_batch(self, outputs, target):
print(outputs.size(), target.size())
self.acc_loss += self.criterion(outputs, target)
if self.mask is None:
self.norm_term += np.prod(target.size())
31 changes: 20 additions & 11 deletions seq2seq/models/DecoderRNN.py
@@ -7,7 +7,7 @@
from torch.autograd import Variable
import torch.nn.functional as F

from .attention import Attention
from .attention import Attention, PointerAttention
from .baseRNN import BaseRNN

if torch.cuda.is_available():
@@ -30,7 +30,8 @@ class DecoderRNN(BaseRNN):
rnn_cell (str, optional): type of RNN cell (default: gru)
input_dropout_p (float, optional): dropout probability for the input sequence (default: 0)
dropout_p (float, optional): dropout probability for the output sequence (default: 0)
use_attention(bool, optional): flag indication whether to use attention mechanism or not (default: false)
attention(str, optional): type of attention mechanism (global, pointer),
No attention mechanism is used if None. (default: None)
Attributes:
KEY_ATTN_SCORE (str): key used to indicate attention weights in `ret_dict`
@@ -67,24 +68,30 @@ class DecoderRNN(BaseRNN):
def __init__(self, vocab_size, max_len, hidden_size,
sos_id, eos_id,
n_layers=1, rnn_cell='gru',
input_dropout_p=0, dropout_p=0, use_attention=False):
input_dropout_p=0, dropout_p=0, attention=None):
super(DecoderRNN, self).__init__(vocab_size, max_len, hidden_size,
input_dropout_p, dropout_p,
n_layers, rnn_cell)

self.output_size = vocab_size
self.max_length = max_len
self.use_attention = use_attention
self.eos_id = eos_id
self.sos_id = sos_id

self.init_input = None

self.embedding = nn.Embedding(self.output_size, self.hidden_size)
if use_attention:
if attention == 'global':
self.attention = Attention(self.hidden_size)

self.out = nn.Linear(self.hidden_size, self.output_size)
self.out = nn.Linear(self.hidden_size, self.output_size)
elif attention == 'pointer':
self.attention = PointerAttention(self.hidden_size)
self.out = lambda x: x
elif attention is None:
self.attention = None
self.out = nn.Linear(self.hidden_size, self.output_size)
else:
raise ValueError("Attention type: %s is not supported." % attention)

def forward_step(self, input_var, hidden, encoder_outputs, function):
batch_size = input_var.size(0)
@@ -95,16 +102,18 @@ def forward_step(self, input_var, hidden, encoder_outputs, function):
output, hidden = self.rnn(embedded, hidden)

attn = None
if self.use_attention:
if self.attention is not None:
output, attn = self.attention(output, encoder_outputs)
print(self.out(output).size())
predicted_softmax = function(self.out(output.view(batch_size * output_size, -1))) \
.view(batch_size, output_size, -1)

predicted_softmax = function(self.out(output.view(-1, self.hidden_size))).view(batch_size, output_size, -1)
return predicted_softmax, hidden, attn

def forward(self, inputs=None, encoder_hidden=None, function=F.log_softmax,
encoder_outputs=None, teacher_forcing_ratio=0):
ret_dict = dict()
if self.use_attention:
if self.attention is not None:
if encoder_outputs is None:
raise ValueError("Argument encoder_outputs cannot be None when attention is used.")
ret_dict[DecoderRNN.KEY_ATTN_SCORE] = list()
@@ -139,7 +148,7 @@ def forward(self, inputs=None, encoder_hidden=None, function=F.log_softmax,

def decode(step, step_output, step_attn):
decoder_outputs.append(step_output)
if self.use_attention:
if self.attention is not None:
ret_dict[DecoderRNN.KEY_ATTN_SCORE].append(step_attn)
symbols = decoder_outputs[-1].topk(1)[1]
sequence_symbols.append(symbols)
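Note (not part of the diff): the boolean use_attention flag becomes a three-way attention argument: 'global' keeps the existing Attention plus a Linear projection to the vocabulary, 'pointer' swaps in PointerAttention with an identity output layer (so the decoder scores encoder positions instead of vocabulary tokens), and None disables attention. A minimal constructor sketch with hypothetical sizes, importing by the module path of seq2seq/models/DecoderRNN.py:

    from seq2seq.models.DecoderRNN import DecoderRNN

    decoder = DecoderRNN(vocab_size=12, max_len=10, hidden_size=16,
                         sos_id=2, eos_id=3,
                         dropout_p=0.2, attention='pointer')   # or 'global', or None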
35 changes: 35 additions & 0 deletions seq2seq/models/attention.py
@@ -70,3 +70,38 @@ def forward(self, output, context):
output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size)

return output, attn

class PointerAttention(nn.Module):
r"""
Applies an pointer attention mechanism on the output features from the decoder.
Args:
dim(int): The number of expected features in the output
Inputs: output, context
- **output** (batch, output_len, dimensions): tensor containing the output features from the decoder.
- **context** (batch, input_len, dimensions): tensor containing features of the encoded input sequence.
Outputs: output, attn
- **output** (batch, output_len, input_len): tensor containing the attended output features from the decoder.
- **attn** (batch, output_len, input_len): tensor containing attention weights.
"""
def __init__(self, dim):
super(PointerAttention, self).__init__()
self.dec_linear = nn.Linear(dim, dim, bias=False)
self.enc_linear = nn.Linear(dim, dim, bias=False)
self.out_linear = nn.Linear(dim, 1, bias=False)

def forward(self, output, context):
batch_size = output.size(0)
hidden_size = output.size(2)
out_len = output.size(1)
in_len = context.size(1)

# (batch_size, out_len, dim) -> (batch_size * out_len, dim) -> (batch_size * out_len, dim)
dec = self.dec_linear(output.contiguous().view(-1, hidden_size))
dec = dec.view(batch_size, out_len, 1, hidden_size).expand(batch_size, out_len, in_len, hidden_size)
# (batch_size, in_len, dim) - > (batch_size * in_len, dim) -> (batch_size * in_len, dim)
enc = self.enc_linear(context.contiguous().view(-1, hidden_size))
enc = enc.view(batch_size, 1, in_len, hidden_size).expand(batch_size, out_len, in_len, hidden_size)
# (batch_size, out_len, in_len, dim) -> (batch_size, out_len, in_len)
attn = self.out_linear((F.tanh(enc + dec).view(-1, hidden_size))).view(batch_size, out_len, in_len)

return attn, attn
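Note (not part of the diff): the scoring above matches the additive formulation of Pointer Networks (Vinyals et al., 2015). Writing d_i for a decoder output, e_j for an encoder output, and W_dec, W_enc, v for dec_linear, enc_linear, out_linear:

    u^{(i)}_j = v^\top \tanh(W_{enc} e_j + W_{dec} d_i)

The (batch, output_len, input_len) tensor of raw scores is returned in both positions of the tuple; normalization over input positions is left to the caller (in DecoderRNN.forward_step above, the function argument, F.log_softmax by default, is applied to the decoder output).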
24 changes: 24 additions & 0 deletions tests/test_ptr_attention.py
@@ -0,0 +1,24 @@
import os
import unittest

import torch
from torch.autograd import Variable

from seq2seq.models.attention import PointerAttention

class TestDecoderRNN(unittest.TestCase):

def test_shape(self):
batch_size = 8
input_len = 10
output_len = 11
hidden_size = 16

ptr_attn = PointerAttention(hidden_size)

output = Variable(torch.randn(batch_size, output_len, hidden_size))
context = Variable(torch.randn(batch_size, input_len, hidden_size))

output, attn = ptr_attn(output, context)

self.assertEqual((batch_size, output_len, input_len), output.size())