from typing import Mapping, Union
import numpy as np
from .operation import Operation
from .op_placeholder import OpPlaceholder
class OpDot(Operation):
"""Dot product of two tensors.
**If** either `x` or `y` is a scalar, then it is equivalent to :class:`OpMultiply`.
**If** both `x` and `y` are 1-D arrays, it is the inner product of vectors, the result is a scalar:
.. math::
z = \sum_k x_{k} \cdot y_{k}
Partial derivatives of a single element:
.. math::
\\frac{\\partial L}{\\partial x_{i}} =
\\frac{\partial L}{\\partial z} \\cdot \\frac{\\partial z}{\\partial x_i} =
\\frac{\\partial L}{\\partial z} \\cdot y_i
.. math::
\\frac{\\partial L}{\\partial y_{i}} =
\\frac{\\partial L}{\\partial z} \\cdot \\frac{\\partial z}{\\partial y_i} =
\\frac{\\partial L}{\\partial z} \\cdot x_i
Vector derivatives:
.. math::
\\frac{\\partial L}{\\partial x} = \\frac{\\partial L}{\\partial z} \\cdot y
.. math::
\\frac{\\partial L}{\\partial y} = \\frac{\\partial L}{\\partial z} \\cdot x
Note that since `z` is a scalar, the calculation of vector derivatives in this case is the dot operation.
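
    A quick numerical sketch of the vector case with plain ``numpy``; the name ``dL_dz`` (the upstream gradient)
    is illustrative only::

        >>> import numpy as np
        >>> x = np.array([1.0, 2.0, 3.0])
        >>> y = np.array([4.0, 5.0, 6.0])
        >>> dL_dz = 2.0
        >>> np.allclose(dL_dz * y, [8.0, 10.0, 12.0])   # dL/dx = dL/dz * y
        True
        >>> np.allclose(dL_dz * x, [2.0, 4.0, 6.0])     # dL/dy = dL/dz * x
        True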

    **If** both `x` and `y` are 2-D arrays, it is matrix multiplication and the result is a 2-D array:

    .. math::
        z_{ij} = \\sum_{k} x_{ik} \\cdot y_{kj}

    Partial derivative of a single element:

    .. math::
        \\begin{array}{rcl}
        \\displaystyle \\frac{\\partial L}{\\partial x_{ij}}
        &=& \\displaystyle \\sum_{a,b} \\frac{\\partial L}{\\partial z_{ab}} \\cdot
        \\frac{\\partial z_{ab}}{\\partial x_{ij}} \\\\
        &=& \\displaystyle \\sum_{a,b} \\frac{\\partial L}{\\partial z_{ab}} \\cdot
        \\frac{\\partial \\left ( \\sum_k x_{ak} \\cdot y_{kb} \\right )}{\\partial x_{ij}} \\\\
        &=& \\displaystyle \\sum_{b} \\frac{\\partial L}{\\partial z_{ib}} \\cdot
        \\frac{\\partial \\left ( x_{ij} \\cdot y_{jb} \\right )}{\\partial x_{ij}} \\\\
        &=& \\displaystyle \\sum_{k} \\frac{\\partial L}{\\partial z_{ik}} \\cdot
        y_{jk} \\\\
        &=& \\displaystyle \\sum_{k} \\left ( \\frac{\\partial L}{\\partial Z} \\right )_{ik} \\cdot (Y^T)_{kj} \\\\
        \\end{array}

    .. math::
        \\frac{\\partial L}{\\partial y_{ij}} = \\sum_{k} (X^T)_{ik} \\cdot
        \\left ( \\frac{\\partial L}{\\partial Z} \\right )_{kj}

    The element-wise partial derivatives have the same form as the dot operation itself, so the matrix
    derivatives are:

    .. math::
        \\frac{\\partial L}{\\partial X} = \\frac{\\partial L}{\\partial Z} \\cdot Y^T

    .. math::
        \\frac{\\partial L}{\\partial Y} = X^T \\cdot \\frac{\\partial L}{\\partial Z}
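
    The matrix rules can be sanity-checked with plain ``numpy``; the names below are illustrative only::

        >>> import numpy as np
        >>> X = np.arange(6.0).reshape(2, 3)
        >>> Y = np.arange(12.0).reshape(3, 4)
        >>> dL_dZ = np.ones((2, 4))            # pretend upstream gradient
        >>> dL_dZ.dot(Y.T).shape               # dL/dX = dL/dZ . Y^T
        (2, 3)
        >>> X.T.dot(dL_dZ).shape               # dL/dY = X^T . dL/dZ
        (3, 4)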

    **If** `x` is an N-D tensor and `y` is an M-D tensor (M >= 2), it is a sum product over the last axis of `x`
    and the second-to-last axis of `y`.

    **If** `x` is an N-D tensor and `y` is a 1-D array, it is a sum product over the last axis of `x` and `y`.
    It is a special case of the previous condition if `y` is treated as a K x 1 matrix and the result is squeezed.
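
    Shape-wise this follows ``numpy.dot``; for example::

        >>> import numpy as np
        >>> np.dot(np.ones((2, 3, 4)), np.ones((5, 4, 6))).shape
        (2, 3, 5, 6)
        >>> np.dot(np.ones((2, 3, 4)), np.ones(4)).shape
        (2, 3)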
"""

    def __init__(self, x: Operation, y: Operation, **kwargs):
        self.inputs = [x, y]
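        # Infer the output shape following the same rules as numpy.dot.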
        if x.isscalar():
            self.shape = y.shape
        elif y.isscalar():
            self.shape = x.shape
        elif x.dim == 1 and y.dim == 1:
            if x.shape[0] != y.shape[0]:
                raise ValueError('The dimensions of the inputs should be equal, found %s and %s'
                                 % (str(x.shape), str(y.shape)))
            self.shape = ()
        elif x.dim == 2 and y.dim == 2:
            if x.shape[1] != y.shape[0]:
                raise ValueError('The last dimension of the first input and the first dimension of the second input '
                                 'should be equal, found %s and %s' % (str(x.shape), str(y.shape)))
            self.shape = (x.shape[0], y.shape[1])
        elif y.dim == 1:
            if x.shape[-1] != y.shape[0]:
                raise ValueError('The last dimension of the first input and the dimension of the second input '
                                 'should be equal, found %s and %s' % (str(x.shape), str(y.shape)))
            self.shape = x.shape[:-1]
        else:
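            # General N-D x M-D (M >= 2) case: contract the last axis of x with the second-to-last axis of y.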
            if x.shape[-1] != y.shape[-2]:
                raise ValueError('The last dimension of the first input and the second-to-last dimension of the '
                                 'second input should be equal, found %s and %s' % (str(x.shape), str(y.shape)))
            self.shape = x.shape[:-1] + y.shape[:-2] + (y.shape[-1],)
        super(OpDot, self).__init__(**kwargs)

    def _get_name(self) -> str:
        return 'dot(%s, %s)' % (self.inputs[0].name, self.inputs[1].name)

    def _get_op_name(self) -> str:
        return 'dot(%s, %s)' % (self.inputs[0]._op_name, self.inputs[1]._op_name)

    def _forward(self, feed_dict: Mapping[Union[str, OpPlaceholder], np.ndarray]) -> np.ndarray:
        return np.dot(self.inputs[0].forward(feed_dict), self.inputs[1].forward(feed_dict))

    def _backward(self, gradient: Operation) -> None:
        x, y = self.inputs
        if x.isscalar():
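            # x is a scalar broadcast over y: its gradient sums every element-wise contribution,
            # while y keeps its shape and is simply scaled by x.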
            self.gradient = [
                (gradient * y).sum(),
                gradient * x,
            ]
        elif y.isscalar():
            self.gradient = [
                gradient * y,
                (gradient * x).sum(),
            ]
        elif x.dim == 1 and y.dim == 1:
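            # Inner product: z is a scalar, so each gradient is the scalar upstream gradient times the other vector.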
            self.gradient = [
                gradient * y,
                gradient * x,
            ]
        elif x.dim == 2 and y.dim == 2:
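            # Matrix multiplication: dL/dX = dL/dZ . Y^T and dL/dY = X^T . dL/dZ (see the docstring derivation).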
            self.gradient = [
                gradient.dot(y.transpose()),
                x.transpose().dot(gradient),
            ]
        elif y.dim == 1:
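            # y is 1-D: dL/dx is the outer product of the upstream gradient with y,
            # and dL/dy accumulates x weighted by the gradient over all leading axes.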
            self.gradient = [
                gradient.expand_dims(axis=-1).dot(y.expand_dims(axis=0)),
                (x * gradient.expand_dims(axis=-1)).sum(axis=tuple(range(x.dim - 1))),
            ]
        else:
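            # General N-D x M-D case: collapse the extra axes by summing the upstream gradient and the other
            # input separately, then rescale by the number of collapsed elements; this approximates, rather
            # than exactly reproduces, the full contraction.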
            zx_dim, zy_dim = x.dim - 1, y.dim - 2
            self.gradient = [
                gradient.sum(axis=tuple(range(zx_dim, zx_dim + zy_dim))).dot(
                    y.sum(axis=tuple(range(zy_dim))).transpose()
                ) * (1.0 / np.prod(y.shape[:zy_dim])),
                gradient.sum(axis=tuple(range(zx_dim))).expand_dims(axis=-1).dot(
                    x.sum(axis=tuple(range(zx_dim))).expand_dims(axis=0)
                ).transpose(axes=tuple(range(zy_dim)) + (-1, -2)) * (1.0 / np.prod(x.shape[:zx_dim])),
            ]
        x.backward(self.gradient[0])
        y.backward(self.gradient[1])