(Python) The Attention Mechanism

The attention structure


import numpy as np
from scipy.special import softmax  # softmax function for normalizing the attention scores

Step 1. Representing the inputs
Every input must have three representations: a Key, a Query, and a Value.

print("Step 1: 3 inputs, d_model=4")  # d_model: dimension of the input vectors x (512 in the original Transformer, reduced to 4 here)
x =np.array([[1.0, 0.0, 1.0, 0.0],    # Input 1
             [0.0, 2.0, 0.0, 2.0],    # Input 2
             [1.0, 1.0, 1.0, 1.0]])   # Input 3   --> each input is represented as a 4-dimensional vector
print(x)


Step 2. Initializing the weight matrices
The weight matrices described in Vaswani et al. (2017) are 64-dimensional; here they are reduced to 3 dimensions.

print("Step 2: weights 3 dimensions x d_model=4")  # each of the 4 input dimensions gets a 3-dimensional weight row

# Query
print("w_query")
w_query =np.array([[1, 0, 1],     # 3-dimensional Query weights for the 1st input dimension
                   [1, 0, 0],     # 3-dimensional Query weights for the 2nd input dimension
                   [0, 0, 1],     # 3-dimensional Query weights for the 3rd input dimension
                   [0, 1, 1]])    # 3-dimensional Query weights for the 4th input dimension
print(w_query)


# Key
print("w_key")
w_key =np.array([[0, 0, 1],     # 3-dimensional Key weights for the 1st input dimension
                 [1, 1, 0],     # 3-dimensional Key weights for the 2nd input dimension
                 [0, 1, 0],     # 3-dimensional Key weights for the 3rd input dimension
                 [1, 1, 0]])    # 3-dimensional Key weights for the 4th input dimension
print(w_key)


# Value
print("w_value")
w_value = np.array([[0, 2, 0],     # 3-dimensional Value weights for the 1st input dimension
                    [0, 3, 0],     # 3-dimensional Value weights for the 2nd input dimension
                    [1, 0, 3],     # 3-dimensional Value weights for the 3rd input dimension
                    [1, 1, 0]])    # 3-dimensional Value weights for the 4th input dimension
print(w_value)

Figure: the weight matrices added to the model

Step 3. Matrix multiplication to obtain Q, K, and V

# x has shape (3, 4) and w_query has shape (4, 3),
# so the result has shape (3, 3)
# Query
print("Queries: x * w_query")
Q=np.matmul(x,w_query)     # multiply the inputs (3x4) by the query weight matrix (4x3) --> a (3x3) matrix
print(Q)

# Key
print("Keys: x * w_key")
K=np.matmul(x,w_key)     # multiply the inputs (3x4) by the key weight matrix (4x3) --> a (3x3) matrix
print(K)

# Value
print("Values: x * w_value")
V=np.matmul(x,w_value)     # multiply the inputs (3x4) by the value weight matrix (4x3) --> a (3x3) matrix
print(V)


Input      Q            K            V
Input 1    [1, 0, 2]    [0, 1, 1]    [1, 2, 3]
Input 2    [2, 2, 2]    [4, 4, 0]    [2, 8, 0]
Input 3    [2, 1, 3]    [2, 3, 1]    [2, 6, 3]
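
As a quick check (not part of the original walkthrough), the table values can be verified against the computed matrices:

# Sanity check: Q, K, and V should match the table above
assert np.array_equal(Q, np.array([[1, 0, 2], [2, 2, 2], [2, 1, 3]]))
assert np.array_equal(K, np.array([[0, 1, 1], [4, 4, 0], [2, 3, 1]]))
assert np.array_equal(V, np.array([[1, 2, 3], [2, 8, 0], [2, 6, 3]]))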

Step 4. Scaled attention scores: Attention(Q, K, V) = softmax(QKᵀ / √d_k) V

print("Step 4: Scaled Attention Scores")
k_d=1   # square root of d_k=3 (≈1.73) is rounded down to 1 for this example
attention_scores = (Q @ K.transpose())/k_d
print(attention_scores)
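
For reference, a minimal sketch of the exact scaling, dividing by √d_k = √3 ≈ 1.73 instead of the rounded-down value of 1 used above:

# Exact scaling: divide by the square root of d_k = 3
exact_scores = (Q @ K.transpose()) / np.sqrt(3)
print(exact_scores)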

Step 5. Scaled softmax attention scores for each vector (feed the Step 4 scores to the softmax function)

attention_scores[0]=softmax(attention_scores[0])
attention_scores[1]=softmax(attention_scores[1])
attention_scores[2]=softmax(attention_scores[2])
print(attention_scores[0])
print(attention_scores[1])
print(attention_scores[2])
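
Equivalently, the three calls above collapse into a single vectorized call, since scipy's softmax accepts an axis argument (the name all_scores below is just for illustration):

# Vectorized form: softmax every row of the score matrix at once
all_scores = softmax((Q @ K.transpose()) / k_d, axis=1)
print(all_scores)                 # same three rows as printed above
print(all_scores.sum(axis=1))     # each row sums to 1.0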

Step 6. The final attention representations (multiply the Step 5 scores by V) for inputs 1, 2, and 3

print("input1 Attention 1")
input1_attention1=attention_scores[0][0]*V[0]
print(input1_attention1)

print("input1 Attention 2")
input1_attention2=attention_scores[0][1]*V[1]
print(input1_attention2)

print("input1 Attention 3")
input1_attention3=attention_scores[0][2]*V[2]
print(input1_attention3)




print("input2 Attention 1")
input2_attention1=attention_scores[1][0]*V[0]
print(input2_attention1)

print("input2 Attention 2")
input2_attention2=attention_scores[1][1]*V[1]
print(input2_attention2)

print("input2 Attention 3")
input2_attention3=attention_scores[1][2]*V[2]
print(input2_attention3)




print("input3 Attention 1")
input3_attention1=attention_scores[2][0]*V[0]
print(input3_attention1)

print("input3 Attention 2")
input3_attention2=attention_scores[2][1]*V[1]
print(input3_attention2)

print("input3 Attention 3")
input3_attention3=attention_scores[2][2]*V[2]
print(input3_attention3)

Step 7. Summing the results

attention_input1=input1_attention1+input1_attention2+input1_attention3    # attention output for input 1
print(attention_input1)

attention_input2=input2_attention1+input2_attention2+input2_attention3    # attention output for input 2
print(attention_input2)

attention_input3=input3_attention1+input3_attention2+input3_attention3    # attention output for input 3
print(attention_input3)
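
Steps 6 and 7 together are just a matrix product of the softmaxed score matrix with V; a one-line sketch that reproduces the three sums above:

# Weighted sum of the value vectors for all inputs at once
all_outputs = attention_scores @ V
print(all_outputs)   # row i matches attention_input(i+1) above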


Step 8. Expanding d_model to 64 dimensions

#We assume we have 3 results with learned weights (they were not trained in this example)
#We assume we are implementing the original Transformer paper. We will have 3 results of 64 dimensions each
attention_head1=np.random.random((3, 64))  # assume 3 results of 64 dimensions each
print(attention_head1)
import numpy as np
from scipy.special import softmax

print("Step 1: 3 inputs, d_model=64")  #d_model : 64
x =np.random.random((3,64)) 
print(x)

print("Step 2: weights 3 dimensions x d_model=64")

# Query
print("w_query")
w_query =np.random.random((64,3))
print(w_query)


# Key
print("w_key")
w_key =np.random.random((64,3))
print(w_key)


# Value
print("w_value")
w_value = np.random.random((64,3))
print(w_value)

# Step 3: matrix multiplication to obtain Q, K, and V
# Query
print("Queries: x * w_query")
Q=np.matmul(x,w_query)    
print(Q)

# Key
print("Keys: x * w_key")
K=np.matmul(x,w_key)     
print(K)

# Value
print("Values: x * w_value")
V=np.matmul(x,w_value)    
print(V)

print("Step 4: Scaled Attention Scores")
k_d=1.73   # square root of d_k=3 (≈1.73), used directly this time
attention_scores = (Q @ K.transpose())/k_d
print(attention_scores)

attention_scores[0]=softmax(attention_scores[0])
attention_scores[1]=softmax(attention_scores[1])
attention_scores[2]=softmax(attention_scores[2])
print(attention_scores[0])
print(attention_scores[1])
print(attention_scores[2])

print("input1 Attention 1")
input1_attention1=attention_scores[0][0]*V[0]
print(input1_attention1)

print("input1 Attention 2")
input1_attention2=attention_scores[0][1]*V[1]
print(input1_attention2)

print("input1 Attention 3")
input1_attention3=attention_scores[0][2]*V[2]
print(input1_attention3)




print("input2 Attention 1")
input2_attention1=attention_scores[1][0]*V[0]
print(input2_attention1)

print("input2 Attention 2")
input2_attention2=attention_scores[1][1]*V[1]
print(input2_attention2)

print("input2 Attention 3")
input2_attention3=attention_scores[1][2]*V[2]
print(input2_attention3)




print("input3 Attention 1")
input3_attention1=attention_scores[2][0]*V[0]
print(input3_attention1)

print("input3 Attention 2")
input3_attention2=attention_scores[2][1]*V[1]
print(input3_attention2)

print("input3 Attention 3")
input3_attention3=attention_scores[2][2]*V[2]
print(input3_attention3)

attention_input1=input1_attention1+input1_attention2+input1_attention3    # attention output for input 1
print(attention_input1)

attention_input2=input2_attention1+input2_attention2+input2_attention3    # attention output for input 2
print(attention_input2)

attention_input3=input3_attention1+input3_attention2+input3_attention3    # attention output for input 3
print(attention_input3)
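
For reference, Steps 3 to 7 condense into a short helper; a minimal sketch (the name attention_output is an illustrative choice, and the exact √d_k scaling is used rather than the 1.73 approximation):

def attention_output(x, w_query, w_key, w_value):
    # Project the inputs into query, key, and value spaces
    Q = x @ w_query
    K = x @ w_key
    V = x @ w_value
    # Scale, softmax row-wise, then take the weighted sum of the values
    scores = softmax(Q @ K.T / np.sqrt(K.shape[1]), axis=1)
    return scores @ V

print(attention_output(x, w_query, w_key, w_value))   # very close to the three sums above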


Step 9. Output of the attention sublayer heads

z0h1=np.random.random((3, 64))
z1h2=np.random.random((3, 64))
z2h3=np.random.random((3, 64))
z3h4=np.random.random((3, 64))
z4h5=np.random.random((3, 64))
z5h6=np.random.random((3, 64))
z6h7=np.random.random((3, 64))
z7h8=np.random.random((3, 64))
print("shape of one head",z0h1.shape,"dimension of 8 heads",64*8)

Step 10. Concatenating the head outputs

print("Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model")
output_attention=np.hstack((z0h1,z1h2,z2h3,z3h4,z4h5,z5h6,z6h7,z7h8))
print(output_attention)
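
In Vaswani et al. (2017), the concatenated heads are then multiplied by a learned output projection W^O of shape 512 x 512 to produce the sublayer output; a minimal sketch with random values standing in for the trained matrix:

# Output projection of the multi-head attention sublayer (W^O is untrained here)
w_o = np.random.random((512, 512))
multi_head_output = output_attention @ w_o
print("multi-head output shape", multi_head_output.shape)   # (3, 512): one 512-d vector per input
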
#@title Transformer Installation
!pip -qq install transformers


###### Step 12. Translating English into French ######
from transformers import pipeline     # import the pipeline module from transformers
translator = pipeline("translation_en_to_fr") # select the English-to-French translation task
#One line of code!
print(translator("It is easy to translate languages with transformers", max_length=40))
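
The translation pipeline returns a list with one dictionary per input sentence; the translated string is stored under the translation_text key, so it can be pulled out directly:

result = translator("It is easy to translate languages with transformers", max_length=40)
print(result[0]['translation_text'])   # just the translated sentence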