// Copyright (C) 2013 Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_CCA_AbSTRACT_Hh_
|
|
#ifdef DLIB_CCA_AbSTRACT_Hh_
|
|
|
|
#include "../matrix/matrix_la_abstract.h"
|
|
#include "random_subset_selector_abstract.h"
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
matrix<typename T::type,0,1> compute_correlations (
|
|
const matrix_exp<T>& L,
|
|
const matrix_exp<T>& R
|
|
);
|
|
/*!
|
|
requires
|
|
- L.size() > 0
|
|
- R.size() > 0
|
|
- L.nr() == R.nr()
|
|
ensures
|
|
- This function treats L and R as sequences of paired row vectors. It
|
|
then computes the correlation values between the elements of these
|
|
row vectors. In particular, we return a vector COR such that:
|
|
- COR.size() == L.nc()
|
|
- for all valid i:
|
|
- COR(i) == the correlation coefficient between the following sequence
|
|
of paired numbers: (L(k,i), R(k,i)) for k: 0 <= k < L.nr().
|
|
Therefore, COR(i) is a value between -1 and 1 inclusive where 1
|
|
indicates perfect correlation and -1 perfect anti-correlation. Note
|
|
that this function assumes the input data vectors have been centered
|
|
(i.e. made to have zero mean). If this is not the case then it will
|
|
report inaccurate results.
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename T
|
|
>
|
|
matrix<T,0,1> cca (
|
|
const matrix<T>& L,
|
|
const matrix<T>& R,
|
|
matrix<T>& Ltrans,
|
|
matrix<T>& Rtrans,
|
|
unsigned long num_correlations,
|
|
unsigned long extra_rank = 5,
|
|
unsigned long q = 2,
|
|
double regularization = 0
|
|
);
|
|
/*!
|
|
requires
|
|
- num_correlations > 0
|
|
- L.size() > 0
|
|
- R.size() > 0
|
|
- L.nr() == R.nr()
|
|
- regularization >= 0
|
|
ensures
|
|
- This function performs a canonical correlation analysis between the row
|
|
vectors in L and R. That is, it finds two transformation matrices, Ltrans
|
|
and Rtrans, such that row vectors in the transformed matrices L*Ltrans and
|
|
R*Rtrans are as correlated as possible. That is, we try to find two transforms
|
|
such that the correlation values returned by compute_correlations(L*Ltrans, R*Rtrans)
|
|
would be maximized.
|
|
- Let N == min(num_correlations, min(R.nr(),min(L.nc(),R.nc())))
|
|
(This is the actual number of elements in the transformed vectors.
|
|
Therefore, note that you can't get more outputs than there are rows or
|
|
columns in the input matrices.)
|
|
- #Ltrans.nr() == L.nc()
|
|
- #Ltrans.nc() == N
|
|
- #Rtrans.nr() == R.nc()
|
|
- #Rtrans.nc() == N
|
|
- This function assumes the data vectors in L and R have already been centered
|
|
(i.e. we assume the vectors have zero means). However, in many cases it is
|
|
fine to use uncentered data with cca(). But if it is important for your
|
|
problem then you should center your data before passing it to cca().
|
|
- This function works with reduced rank approximations of the L and R matrices.
|
|
This makes it fast when working with large matrices. In particular, we use
|
|
the svd_fast() routine to find reduced rank representations of the input
|
|
matrices by calling it as follows: svd_fast(L, U,D,V, num_correlations+extra_rank, q)
|
|
and similarly for R. This means that you can use the extra_rank and q
|
|
arguments to cca() to influence the accuracy of the reduced rank
|
|
approximation. However, the default values should work fine for most
|
|
problems.
|
|
- returns an estimate of compute_correlations(L*#Ltrans, R*#Rtrans). The
|
|
returned vector should exactly match the output of compute_correlations()
|
|
when the reduced rank approximation to L and R is accurate and regularization
|
|
is set to 0. However, if this is not the case then the return value of this
|
|
function will deviate from compute_correlations(L*#Ltrans, R*#Rtrans). This
|
|
deviation can be used to check if the reduced rank approximation is working
|
|
or you need to increase extra_rank.
|
|
- The dimensions of the output vectors produced by L*#Ltrans or R*#Rtrans are
|
|
ordered such that the dimensions with the highest correlations come first.
|
|
That is, after applying the transforms produced by cca() to a set of vectors
|
|
you will find that dimension 0 has the highest correlation, then dimension 1
|
|
has the next highest, and so on. This also means that the list of numbers
|
|
returned from cca() will always be listed in decreasing order.
|
|
- This function performs the ridge regression version of Canonical Correlation
|
|
Analysis when regularization is set to a value > 0. In particular, larger
|
|
values indicate the solution should be more heavily regularized. This can be
|
|
useful when the dimensionality of the data is larger than the number of
|
|
samples.
|
|
- A good discussion of CCA can be found in the paper "Canonical Correlation
|
|
Analysis" by David Weenink. In particular, this function is implemented
|
|
using equations 29 and 30 from his paper. We also use the idea of doing CCA
|
|
on a reduced rank approximation of L and R as suggested by Paramveer S.
|
|
Dhillon in his paper "Two Step CCA: A new spectral method for estimating
|
|
vector models of words".
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename sparse_vector_type,
|
|
typename T
|
|
>
|
|
matrix<T,0,1> cca (
|
|
const std::vector<sparse_vector_type>& L,
|
|
const std::vector<sparse_vector_type>& R,
|
|
matrix<T>& Ltrans,
|
|
matrix<T>& Rtrans,
|
|
unsigned long num_correlations,
|
|
unsigned long extra_rank = 5,
|
|
unsigned long q = 2,
|
|
double regularization = 0
|
|
);
|
|
/*!
|
|
requires
|
|
- num_correlations > 0
|
|
- L.size() == R.size()
|
|
- max_index_plus_one(L) > 0 && max_index_plus_one(R) > 0
|
|
(i.e. L and R can't represent empty matrices)
|
|
- L and R must contain sparse vectors (see the top of dlib/svm/sparse_vector_abstract.h
|
|
for a definition of sparse vector)
|
|
- regularization >= 0
|
|
ensures
|
|
- This is just an overload of the cca() function defined above. Except in this
|
|
case we take a sparse representation of the input L and R matrices rather than
|
|
dense matrices. Therefore, in this case, we interpret L and R as matrices
|
|
with L.size() rows, where each row is defined by a sparse vector. So this
|
|
function does exactly the same thing as the above cca().
|
|
- Note that you can apply the output transforms to a sparse vector with the
|
|
following code:
|
|
sparse_matrix_vector_multiply(trans(Ltrans), your_sparse_vector)
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <
|
|
typename sparse_vector_type,
|
|
typename Rand_type,
|
|
typename T
|
|
>
|
|
matrix<T,0,1> cca (
|
|
const random_subset_selector<sparse_vector_type,Rand_type>& L,
|
|
const random_subset_selector<sparse_vector_type,Rand_type>& R,
|
|
matrix<T>& Ltrans,
|
|
matrix<T>& Rtrans,
|
|
unsigned long num_correlations,
|
|
unsigned long extra_rank = 5,
|
|
unsigned long q = 2,
|
|
double regularization = 0
|
|
);
|
|
/*!
|
|
requires
|
|
- num_correlations > 0
|
|
- L.size() == R.size()
|
|
- max_index_plus_one(L) > 0 && max_index_plus_one(R) > 0
|
|
(i.e. L and R can't represent empty matrices)
|
|
- L and R must contain sparse vectors (see the top of dlib/svm/sparse_vector_abstract.h
|
|
for a definition of sparse vector)
|
|
- regularization >= 0
|
|
ensures
|
|
- returns cca(L.to_std_vector(), R.to_std_vector(), Ltrans, Rtrans, num_correlations, extra_rank, q)
|
|
(i.e. this is just a convenience function for calling the cca() routine when
|
|
your sparse vectors are contained inside a random_subset_selector rather than
|
|
a std::vector)
|
|
!*/
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_CCA_AbSTRACT_Hh_
|
|
|
|
|