(Python) A Positional Encoding Example
Step 1. Import the required modules
!pip install gensim==3.8.3  # library that includes packages such as word2vec
import torch
import nltk  # natural language processing toolkit
nltk.download('punkt')
import math
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action = 'ignore')
Step 2. Open the file
# read the 'text.txt' file
sample = open("text.txt", "r")
s = sample.read()  # the contents of text.txt are stored in the variable 's'
# processing escape characters
f = s.replace("\n", " ")  # replace each newline with a space so the text becomes a single line
Step 3. Tokenize the text and build the data
data = []
# sentence parsing
for i in sent_tokenize(f):  # apply sent_tokenize to the string f built in Step 2 to split it into sentences
    temp = []
    # tokenize the sentence into words
    for j in word_tokenize(i):  # apply word_tokenize to each sentence
        temp.append(j.lower())
    data.append(temp)
data  # inspect the tokenized sentences
Step 4. Build the skip-gram model
# Creating Skip Gram model
model2 = gensim.models.Word2Vec(data, min_count=1, size=512, window=5, sg=1)  # size and sg=1 (skip-gram) are the gensim 3.x parameter names
# 1-The 2-black 3-cat 4-sat 5-on 6-the 7-couch 8-and 9-the 10-brown 11-dog 12-slept 13-on 14-the 15-rug.
word1 = 'black'  # first target word
word2 = 'brown'  # second target word
pos1 = 2   # position of 'black' in the sentence
pos2 = 10  # position of 'brown' in the sentence
a = model2.wv[word1]  # word2vec embedding of word1
b = model2.wv[word2]  # word2vec embedding of word2
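This notebook pins gensim 3.8.3, where the Word2Vec constructor takes size. In gensim 4.x the parameter was renamed to vector_size and word vectors must be read through the .wv attribute; a minimal sketch of the same step under that assumption:
# gensim 4.x equivalent of Step 4 (assumes gensim >= 4.0 is installed)
model2 = gensim.models.Word2Vec(data, min_count=1, vector_size=512, window=5, sg=1)
a = model2.wv[word1]
b = model2.wv[word2]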
Step 5. Compute the similarities
# compute cosine similarity
dot = np.dot(a, b)
norma = np.linalg.norm(a)
normb = np.linalg.norm(b)
cos = dot / (norma * normb)
aa = a.reshape(1, 512)  # used for the word similarity computation
ba = b.reshape(1, 512)  # used for the word similarity computation
cos_lib = cosine_similarity(aa, ba)  # cosine similarity from scikit-learn
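The manually computed cosine above and the scikit-learn result should agree; a small sanity check added here for illustration:
# the hand-computed cosine and the library value should match closely
assert np.isclose(cos, cos_lib[0][0], atol=1e-5)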
pe1 = aa.copy()  # will hold the positional encoding of pos1
pe2 = aa.copy()  # will hold the positional encoding of pos2
pe3 = aa.copy()  # (not used below)
paa = aa.copy()  # positional-encoded embedding of "black"
pba = ba.copy()  # positional-encoded embedding of "brown"
d_model = 512    # embedding dimension
max_print = d_model
max_length = 20  # maximum sequence length for the PyTorch method below
dprint = 0       # set to 1 to print intermediate values
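The two loops below follow the sinusoidal encoding of "Attention Is All You Need": for a token at position pos, even dimension i receives sin(pos / 10000**(i/d_model)) and odd dimension i+1 receives cos(pos / 10000**(i/d_model)); the encoding is then added to the word embedding after scaling the embedding by sqrt(d_model).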
for i in range(0, max_print, 2):
    # sinusoidal positional encoding: 10000**(i/d_model) for even dimension index i
    # (consistent with the vectorized PyTorch method below)
    pe1[0][i] = math.sin(pos1 / (10000 ** (i / d_model)))
    paa[0][i] = (paa[0][i] * math.sqrt(d_model)) + pe1[0][i]    # scaled embedding + positional encoding
    pe1[0][i+1] = math.cos(pos1 / (10000 ** (i / d_model)))
    paa[0][i+1] = (paa[0][i+1] * math.sqrt(d_model)) + pe1[0][i+1]
    if dprint == 1:
        print(i, pe1[0][i], i+1, pe1[0][i+1])
        print(i, paa[0][i], i+1, paa[0][i+1])
        print("\n")
#print(pe1)
# An equivalent vectorized method in PyTorch using torch.exp and math.log:
max_len=max_length
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
#print(pe[:, 0::2])
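If the loop above and the vectorized tensor use the same formula, row pos1 of pe should match pe1; a quick check added for illustration:
# compare the vectorized encoding for position pos1 with the loop result
print(np.allclose(pe[pos1].numpy(), pe1[0], atol=1e-4))  # prints True when the two formulations agree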
for i in range(0, max_print, 2):
    pe2[0][i] = math.sin(pos2 / (10000 ** (i / d_model)))       # positional encoding of pos2, even dimension
    pba[0][i] = (pba[0][i] * math.sqrt(d_model)) + pe2[0][i]    # scaled embedding + positional encoding
    pe2[0][i+1] = math.cos(pos2 / (10000 ** (i / d_model)))     # positional encoding of pos2, odd dimension
    pba[0][i+1] = (pba[0][i+1] * math.sqrt(d_model)) + pe2[0][i+1]
    if dprint == 1:
        print(i, pe2[0][i], i+1, pe2[0][i+1])
        print(i, pba[0][i], i+1, pba[0][i+1])
        print("\n")
Results
print(word1,word2)
cos_lib = cosine_similarity(aa, ba)
print(cos_lib,"word similarity")
cos_lib = cosine_similarity(pe1, pe2)
print(cos_lib,"positional similarity")
cos_lib = cosine_similarity(paa, pba)
print(cos_lib,"positional encoding similarity")
if dprint == 1:
    print(word1)
    print("embedding")
    print(aa)
    print("positional encoding")
    print(pe1)
    print("encoded embedding")
    print(paa)
    print(word2)
    print("embedding")
    print(ba)
    print("positional encoding")
    print(pe2)
    print("encoded embedding")
    print(pba)
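The three similarity computations in the Results section can be wrapped in a small helper so other word/position pairs are easy to compare; this is a sketch under the same assumptions (it reuses model2, d_model, and the vectorized pe tensor), and the compare name is hypothetical:
# hypothetical helper, not part of the original example
def compare(w1, p1, w2, p2):
    e1 = model2.wv[w1].reshape(1, d_model)
    e2 = model2.wv[w2].reshape(1, d_model)
    p_enc1 = pe[p1].numpy().reshape(1, d_model)
    p_enc2 = pe[p2].numpy().reshape(1, d_model)
    enc1 = e1 * math.sqrt(d_model) + p_enc1
    enc2 = e2 * math.sqrt(d_model) + p_enc2
    print(w1, w2)
    print(cosine_similarity(e1, e2), "word similarity")
    print(cosine_similarity(p_enc1, p_enc2), "positional similarity")
    print(cosine_similarity(enc1, enc2), "positional encoding similarity")

compare('black', 2, 'brown', 10)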