
PyTorch Official Tutorial 2: Word Embeddings

N-Gram Model

In an n-gram language model, given a sequence of words $w$, we want to compute:

$$P(w_i \mid w_{i-1}, w_{i-2}, \dots, w_{i-n+1})$$

The model is a small feed-forward network: embedding layer -> linear -> ReLU -> linear -> log-softmax. With a context of two previous words it minimizes

$$-\log p(w_i \mid w_{i-1}, w_{i-2}) = -\log \text{Softmax}(A_2(A_1 q_w + b_1) + b_2)$$

where $q_w$ is the concatenation of the embedding vectors of the two preceding words.
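To make the shapes concrete, here is a minimal sketch of that forward pass, assuming a toy 100-word vocabulary and the tutorial's EMBEDDING_DIM = 10 and CONTEXT_SIZE = 2 (the index values are arbitrary): the two context indices are looked up in the embedding table and flattened into the concatenated vector $q_w$ before the two linear layers.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, embedding_dim, context_size = 100, 10, 2   # toy sizes for illustration

embedding = nn.Embedding(vocab_size, embedding_dim)
linear1 = nn.Linear(context_size * embedding_dim, 128)
linear2 = nn.Linear(128, vocab_size)

context_ids = torch.LongTensor([3, 57])            # indices of w_{i-2} and w_{i-1}
q_w = embedding(context_ids).view(1, -1)           # (2, 10) -> (1, 20): the concatenation
hidden = F.relu(linear1(q_w))                      # (1, 128)
log_probs = F.log_softmax(linear2(hidden), dim=1)  # (1, 100): one log-probability per word
print(q_w.shape, log_probs.shape)
```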

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(666)

CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

test_data = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# Build a list of tuples, each of the form ([word_i-2, word_i-1], target word).
trigrams = [([test_data[i], test_data[i + 1]], test_data[i + 2])
            for i in range(len(test_data) - 2)]

vocab = set(test_data)
VOCAB_SIZE = len(vocab)
word_to_id = {word: idx for idx, word in enumerate(vocab)}


class NGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        # Look up the context embeddings and flatten them into one concatenated row vector.
        embeds = self.embedding(inputs).view(1, -1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


model = NGramModel(VOCAB_SIZE, EMBEDDING_DIM, CONTEXT_SIZE)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)


def train():
    model.train()
    for epoch in range(50):
        total_loss = 0
        for context, target in trigrams:
            model.zero_grad()
            # nn.Embedding() expects integer indices as input.
            context_ids = torch.LongTensor([word_to_id[w] for w in context])
            log_probs = model(context_ids)
            target_id = torch.LongTensor([word_to_id[target]])
            loss = loss_function(log_probs, target_id)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("epoch=", epoch, " total_loss=", total_loss)


train()
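After training, the learned vectors live in model.embedding.weight, and the model can be asked for the most likely next word given a two-word context. A short usage sketch, run after the code above; the query words are just picks from the training text, and the prediction quality depends on how well the tiny corpus has been fit:

```python
# The learned 10-dimensional vector for a word in the vocabulary.
print(model.embedding.weight[word_to_id["beauty"]])

# Predict the most likely next word for a two-word context.
id_to_word = {i: w for w, i in word_to_id.items()}
with torch.no_grad():
    context_ids = torch.LongTensor([word_to_id[w] for w in ["When", "forty"]])
    log_probs = model(context_ids)
    print(id_to_word[log_probs.argmax(dim=1).item()])  # ideally "winters"
```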

CBOW Model

Given a word $w_i$ and a window of $N$ words on each side, denote the set of all context words by $C$ (in the code below, $N$ = CONTEXT_SIZE = 2).

The CBOW model tries to minimize

$$-\log p(w_i \mid C) = -\log \text{Softmax}\Big(A\Big(\sum_{w \in C} q_w\Big) + b\Big)$$

where $q_w$ is the word embedding of $w$.

The model is: embedding layer -> sum -> linear -> log-softmax (the implementation below adds an extra linear + ReLU hidden layer, just like the n-gram model).
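The only structural change from the n-gram model is that the context embeddings are summed rather than concatenated, so the linear layer sees a single embedding_dim-sized vector. A minimal shape sketch of this single-linear form, again with toy sizes and arbitrary indices:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, embedding_dim = 100, 10   # toy sizes for illustration

embedding = nn.Embedding(vocab_size, embedding_dim)
linear = nn.Linear(embedding_dim, vocab_size)

context_ids = torch.LongTensor([3, 57, 12, 9])           # w_{i-2}, w_{i-1}, w_{i+1}, w_{i+2}
summed = embedding(context_ids).sum(dim=0).view(1, -1)   # (4, 10) -> (1, 10): the sum over C
log_probs = F.log_softmax(linear(summed), dim=1)         # (1, 100)
print(summed.shape, log_probs.shape)
```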

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(666)

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
EMBEDDING_DIM = 10
EPOCHS = 150

test_data = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# Build a list of tuples, each of the form
# ([word_i-2, word_i-1, word_i+1, word_i+2], target word).
data = []
for i in range(2, len(test_data) - 2):
    context = [test_data[i - 2], test_data[i - 1], test_data[i + 1], test_data[i + 2]]
    target = test_data[i]
    data.append((context, target))

vocab = set(test_data)
VOCAB_SIZE = len(vocab)
word_to_id = {word: idx for idx, word in enumerate(vocab)}


class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # The context embeddings are summed, so the input size is a single embedding_dim.
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        # x holds the integer indices of the context words;
        # sum their embeddings into a single row vector.
        embeds = self.embedding(x).sum(dim=0).view(1, -1)
        t1 = F.relu(self.linear1(embeds))
        t2 = self.linear2(t1)
        out = F.log_softmax(t2, dim=1)
        return out


loss_fn = nn.NLLLoss()
model = CBOW(VOCAB_SIZE, EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)


def train():
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for context, target in data:
            model.zero_grad()
            context_ids = torch.LongTensor([word_to_id[w] for w in context])
            target_id = torch.LongTensor([word_to_id[target]])
            log_probs = model(context_ids)
            loss = loss_fn(log_probs, target_id)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("epoch=", epoch, " loss=", total_loss)


train()
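As with the n-gram model, the trained CBOW model can be used to fetch a word vector or to predict the center word from its surrounding context. A short usage sketch, run after the code above; the context chosen here surrounds the target word "winters" in the corpus, and the prediction is only as good as the fit on this tiny dataset:

```python
id_to_word = {i: w for w, i in word_to_id.items()}

with torch.no_grad():
    # Two words to the left and two to the right of the target "winters".
    context = ["When", "forty", "shall", "besiege"]
    context_ids = torch.LongTensor([word_to_id[w] for w in context])
    log_probs = model(context_ids)
    print(id_to_word[log_probs.argmax(dim=1).item()])  # ideally "winters"

# The learned embedding for any word in the vocabulary.
print(model.embedding.weight[word_to_id["winters"]])
```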