1 | // This file is part of Eigen, a lightweight C++ template library |
---|
2 | // for linear algebra. |
---|
3 | // |
---|
4 | // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> |
---|
5 | // |
---|
6 | // This Source Code Form is subject to the terms of the Mozilla |
---|
7 | // Public License v. 2.0. If a copy of the MPL was not distributed |
---|
8 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
---|
9 | |
---|
10 | #ifndef EIGEN_GENERAL_BLOCK_PANEL_H |
---|
11 | #define EIGEN_GENERAL_BLOCK_PANEL_H |
---|
12 | |
---|
13 | namespace Eigen { |
---|
14 | |
---|
15 | namespace internal { |
---|
16 | |
---|
17 | template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> |
---|
18 | class gebp_traits; |
---|
19 | |
---|
20 | |
---|
21 | /** \internal \returns b if a<=0, and returns a otherwise. */ |
---|
22 | inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) |
---|
23 | { |
---|
24 | return a<=0 ? b : a; |
---|
25 | } |
---|
26 | |
---|
27 | /** \internal */ |
---|
28 | inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) |
---|
29 | { |
---|
30 | static std::ptrdiff_t m_l1CacheSize = 0; |
---|
31 | static std::ptrdiff_t m_l2CacheSize = 0; |
---|
32 | if(m_l2CacheSize==0) |
---|
33 | { |
---|
34 | m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); |
---|
35 | m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); |
---|
36 | } |
---|
37 | |
---|
38 | if(action==SetAction) |
---|
39 | { |
---|
40 | // set the cpu cache size and cache all block sizes from a global cache size in byte |
---|
41 | eigen_internal_assert(l1!=0 && l2!=0); |
---|
42 | m_l1CacheSize = *l1; |
---|
43 | m_l2CacheSize = *l2; |
---|
44 | } |
---|
45 | else if(action==GetAction) |
---|
46 | { |
---|
47 | eigen_internal_assert(l1!=0 && l2!=0); |
---|
48 | *l1 = m_l1CacheSize; |
---|
49 | *l2 = m_l2CacheSize; |
---|
50 | } |
---|
51 | else |
---|
52 | { |
---|
53 | eigen_internal_assert(false); |
---|
54 | } |
---|
55 | } |
---|
56 | |
---|
57 | /** \brief Computes the blocking parameters for a m x k times k x n matrix product |
---|
58 | * |
---|
59 | * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. |
---|
60 | * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. |
---|
61 | * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. |
---|
62 | * |
---|
63 | * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, |
---|
64 | * this function computes the blocking size parameters along the respective dimensions |
---|
65 | * for matrix products and related algorithms. The blocking sizes depends on various |
---|
66 | * parameters: |
---|
67 | * - the L1 and L2 cache sizes, |
---|
68 | * - the register level blocking sizes defined by gebp_traits, |
---|
69 | * - the number of scalars that fit into a packet (when vectorization is enabled). |
---|
70 | * |
---|
71 | * \sa setCpuCacheSizes */ |
---|
72 | template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType> |
---|
73 | void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) |
---|
74 | { |
---|
75 | EIGEN_UNUSED_VARIABLE(n); |
---|
76 | // Explanations: |
---|
77 | // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and |
---|
78 | // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed |
---|
79 | // per kc x nr vertical small panels where nr is the blocking size along the n dimension |
---|
80 | // at the register level. For vectorization purpose, these small vertical panels are unpacked, |
---|
81 | // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to |
---|
82 | // stay in L1 cache. |
---|
83 | std::ptrdiff_t l1, l2; |
---|
84 | |
---|
85 | typedef gebp_traits<LhsScalar,RhsScalar> Traits; |
---|
86 | enum { |
---|
87 | kdiv = KcFactor * 2 * Traits::nr |
---|
88 | * Traits::RhsProgress * sizeof(RhsScalar), |
---|
89 | mr = gebp_traits<LhsScalar,RhsScalar>::mr, |
---|
90 | mr_mask = (0xffffffff/mr)*mr |
---|
91 | }; |
---|
92 | |
---|
93 | manage_caching_sizes(GetAction, &l1, &l2); |
---|
94 | k = std::min<SizeType>(k, l1/kdiv); |
---|
95 | SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; |
---|
96 | if(_m<m) m = _m & mr_mask; |
---|
97 | } |
---|
98 | |
---|
99 | template<typename LhsScalar, typename RhsScalar, typename SizeType> |
---|
100 | inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) |
---|
101 | { |
---|
102 | computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); |
---|
103 | } |
---|
104 | |
---|
105 | #ifdef EIGEN_HAS_FUSE_CJMADD |
---|
106 | #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); |
---|
107 | #else |
---|
108 | |
---|
109 | // FIXME (a bit overkill maybe ?) |
---|
110 | |
---|
111 | template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector { |
---|
112 | EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) |
---|
113 | { |
---|
114 | c = cj.pmadd(a,b,c); |
---|
115 | } |
---|
116 | }; |
---|
117 | |
---|
118 | template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> { |
---|
119 | EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) |
---|
120 | { |
---|
121 | t = b; t = cj.pmul(a,t); c = padd(c,t); |
---|
122 | } |
---|
123 | }; |
---|
124 | |
---|
125 | template<typename CJ, typename A, typename B, typename C, typename T> |
---|
126 | EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) |
---|
127 | { |
---|
128 | gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t); |
---|
129 | } |
---|
130 | |
---|
131 | #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); |
---|
132 | // #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); |
---|
133 | #endif |
---|
134 | |
---|
135 | /* Vectorization logic |
---|
136 | * real*real: unpack rhs to constant packets, ... |
---|
137 | * |
---|
138 | * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), |
---|
139 | * storing each res packet into two packets (2x2), |
---|
140 | * at the end combine them: swap the second and addsub them |
---|
141 | * cf*cf : same but with 2x4 blocks |
---|
142 | * cplx*real : unpack rhs to constant packets, ... |
---|
143 | * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual |
---|
144 | */ |
---|
145 | template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs> |
---|
146 | class gebp_traits |
---|
147 | { |
---|
148 | public: |
---|
149 | typedef _LhsScalar LhsScalar; |
---|
150 | typedef _RhsScalar RhsScalar; |
---|
151 | typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; |
---|
152 | |
---|
153 | enum { |
---|
154 | ConjLhs = _ConjLhs, |
---|
155 | ConjRhs = _ConjRhs, |
---|
156 | Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, |
---|
157 | LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, |
---|
158 | RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, |
---|
159 | ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, |
---|
160 | |
---|
161 | NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, |
---|
162 | |
---|
163 | // register block size along the N direction (must be either 2 or 4) |
---|
164 | nr = NumberOfRegisters/4, |
---|
165 | |
---|
166 | // register block size along the M direction (currently, this one cannot be modified) |
---|
167 | mr = 2 * LhsPacketSize, |
---|
168 | |
---|
169 | WorkSpaceFactor = nr * RhsPacketSize, |
---|
170 | |
---|
171 | LhsProgress = LhsPacketSize, |
---|
172 | RhsProgress = RhsPacketSize |
---|
173 | }; |
---|
174 | |
---|
175 | typedef typename packet_traits<LhsScalar>::type _LhsPacket; |
---|
176 | typedef typename packet_traits<RhsScalar>::type _RhsPacket; |
---|
177 | typedef typename packet_traits<ResScalar>::type _ResPacket; |
---|
178 | |
---|
179 | typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; |
---|
180 | typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; |
---|
181 | typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; |
---|
182 | |
---|
183 | typedef ResPacket AccPacket; |
---|
184 | |
---|
185 | EIGEN_STRONG_INLINE void initAcc(AccPacket& p) |
---|
186 | { |
---|
187 | p = pset1<ResPacket>(ResScalar(0)); |
---|
188 | } |
---|
189 | |
---|
190 | EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) |
---|
191 | { |
---|
192 | for(DenseIndex k=0; k<n; k++) |
---|
193 | pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); |
---|
194 | } |
---|
195 | |
---|
196 | EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const |
---|
197 | { |
---|
198 | dest = pload<RhsPacket>(b); |
---|
199 | } |
---|
200 | |
---|
201 | EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const |
---|
202 | { |
---|
203 | dest = pload<LhsPacket>(a); |
---|
204 | } |
---|
205 | |
---|
206 | EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const |
---|
207 | { |
---|
208 | tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); |
---|
209 | } |
---|
210 | |
---|
211 | EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const |
---|
212 | { |
---|
213 | r = pmadd(c,alpha,r); |
---|
214 | } |
---|
215 | |
---|
216 | protected: |
---|
217 | // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; |
---|
218 | // conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj; |
---|
219 | }; |
---|
220 | |
---|
221 | template<typename RealScalar, bool _ConjLhs> |
---|
222 | class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false> |
---|
223 | { |
---|
224 | public: |
---|
225 | typedef std::complex<RealScalar> LhsScalar; |
---|
226 | typedef RealScalar RhsScalar; |
---|
227 | typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; |
---|
228 | |
---|
229 | enum { |
---|
230 | ConjLhs = _ConjLhs, |
---|
231 | ConjRhs = false, |
---|
232 | Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, |
---|
233 | LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, |
---|
234 | RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, |
---|
235 | ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, |
---|
236 | |
---|
237 | NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, |
---|
238 | nr = NumberOfRegisters/4, |
---|
239 | mr = 2 * LhsPacketSize, |
---|
240 | WorkSpaceFactor = nr*RhsPacketSize, |
---|
241 | |
---|
242 | LhsProgress = LhsPacketSize, |
---|
243 | RhsProgress = RhsPacketSize |
---|
244 | }; |
---|
245 | |
---|
246 | typedef typename packet_traits<LhsScalar>::type _LhsPacket; |
---|
247 | typedef typename packet_traits<RhsScalar>::type _RhsPacket; |
---|
248 | typedef typename packet_traits<ResScalar>::type _ResPacket; |
---|
249 | |
---|
250 | typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; |
---|
251 | typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; |
---|
252 | typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; |
---|
253 | |
---|
254 | typedef ResPacket AccPacket; |
---|
255 | |
---|
256 | EIGEN_STRONG_INLINE void initAcc(AccPacket& p) |
---|
257 | { |
---|
258 | p = pset1<ResPacket>(ResScalar(0)); |
---|
259 | } |
---|
260 | |
---|
261 | EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) |
---|
262 | { |
---|
263 | for(DenseIndex k=0; k<n; k++) |
---|
264 | pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); |
---|
265 | } |
---|
266 | |
---|
267 | EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const |
---|
268 | { |
---|
269 | dest = pload<RhsPacket>(b); |
---|
270 | } |
---|
271 | |
---|
272 | EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const |
---|
273 | { |
---|
274 | dest = pload<LhsPacket>(a); |
---|
275 | } |
---|
276 | |
---|
277 | EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const |
---|
278 | { |
---|
279 | madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); |
---|
280 | } |
---|
281 | |
---|
282 | EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const |
---|
283 | { |
---|
284 | tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); |
---|
285 | } |
---|
286 | |
---|
287 | EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const |
---|
288 | { |
---|
289 | c += a * b; |
---|
290 | } |
---|
291 | |
---|
292 | EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const |
---|
293 | { |
---|
294 | r = cj.pmadd(c,alpha,r); |
---|
295 | } |
---|
296 | |
---|
297 | protected: |
---|
298 | conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; |
---|
299 | }; |
---|
300 | |
---|
301 | template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> |
---|
302 | class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > |
---|
303 | { |
---|
304 | public: |
---|
305 | typedef std::complex<RealScalar> Scalar; |
---|
306 | typedef std::complex<RealScalar> LhsScalar; |
---|
307 | typedef std::complex<RealScalar> RhsScalar; |
---|
308 | typedef std::complex<RealScalar> ResScalar; |
---|
309 | |
---|
310 | enum { |
---|
311 | ConjLhs = _ConjLhs, |
---|
312 | ConjRhs = _ConjRhs, |
---|
313 | Vectorizable = packet_traits<RealScalar>::Vectorizable |
---|
314 | && packet_traits<Scalar>::Vectorizable, |
---|
315 | RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1, |
---|
316 | ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, |
---|
317 | |
---|
318 | nr = 2, |
---|
319 | mr = 2 * ResPacketSize, |
---|
320 | WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, |
---|
321 | |
---|
322 | LhsProgress = ResPacketSize, |
---|
323 | RhsProgress = Vectorizable ? 2*ResPacketSize : 1 |
---|
324 | }; |
---|
325 | |
---|
326 | typedef typename packet_traits<RealScalar>::type RealPacket; |
---|
327 | typedef typename packet_traits<Scalar>::type ScalarPacket; |
---|
328 | struct DoublePacket |
---|
329 | { |
---|
330 | RealPacket first; |
---|
331 | RealPacket second; |
---|
332 | }; |
---|
333 | |
---|
334 | typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; |
---|
335 | typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket; |
---|
336 | typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; |
---|
337 | typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket; |
---|
338 | |
---|
339 | EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } |
---|
340 | |
---|
341 | EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) |
---|
342 | { |
---|
343 | p.first = pset1<RealPacket>(RealScalar(0)); |
---|
344 | p.second = pset1<RealPacket>(RealScalar(0)); |
---|
345 | } |
---|
346 | |
---|
347 | /* Unpack the rhs coeff such that each complex coefficient is spread into |
---|
348 | * two packects containing respectively the real and imaginary coefficient |
---|
349 | * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] |
---|
350 | */ |
---|
351 | EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) |
---|
352 | { |
---|
353 | for(DenseIndex k=0; k<n; k++) |
---|
354 | { |
---|
355 | if(Vectorizable) |
---|
356 | { |
---|
357 | pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); |
---|
358 | pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); |
---|
359 | } |
---|
360 | else |
---|
361 | b[k] = rhs[k]; |
---|
362 | } |
---|
363 | } |
---|
364 | |
---|
365 | EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } |
---|
366 | |
---|
367 | EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const |
---|
368 | { |
---|
369 | dest.first = pload<RealPacket>((const RealScalar*)b); |
---|
370 | dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize)); |
---|
371 | } |
---|
372 | |
---|
373 | // nothing special here |
---|
374 | EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const |
---|
375 | { |
---|
376 | dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); |
---|
377 | } |
---|
378 | |
---|
379 | EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const |
---|
380 | { |
---|
381 | c.first = padd(pmul(a,b.first), c.first); |
---|
382 | c.second = padd(pmul(a,b.second),c.second); |
---|
383 | } |
---|
384 | |
---|
385 | EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const |
---|
386 | { |
---|
387 | c = cj.pmadd(a,b,c); |
---|
388 | } |
---|
389 | |
---|
390 | EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } |
---|
391 | |
---|
392 | EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const |
---|
393 | { |
---|
394 | // assemble c |
---|
395 | ResPacket tmp; |
---|
396 | if((!ConjLhs)&&(!ConjRhs)) |
---|
397 | { |
---|
398 | tmp = pcplxflip(pconj(ResPacket(c.second))); |
---|
399 | tmp = padd(ResPacket(c.first),tmp); |
---|
400 | } |
---|
401 | else if((!ConjLhs)&&(ConjRhs)) |
---|
402 | { |
---|
403 | tmp = pconj(pcplxflip(ResPacket(c.second))); |
---|
404 | tmp = padd(ResPacket(c.first),tmp); |
---|
405 | } |
---|
406 | else if((ConjLhs)&&(!ConjRhs)) |
---|
407 | { |
---|
408 | tmp = pcplxflip(ResPacket(c.second)); |
---|
409 | tmp = padd(pconj(ResPacket(c.first)),tmp); |
---|
410 | } |
---|
411 | else if((ConjLhs)&&(ConjRhs)) |
---|
412 | { |
---|
413 | tmp = pcplxflip(ResPacket(c.second)); |
---|
414 | tmp = psub(pconj(ResPacket(c.first)),tmp); |
---|
415 | } |
---|
416 | |
---|
417 | r = pmadd(tmp,alpha,r); |
---|
418 | } |
---|
419 | |
---|
420 | protected: |
---|
421 | conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; |
---|
422 | }; |
---|
423 | |
---|
424 | template<typename RealScalar, bool _ConjRhs> |
---|
425 | class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs > |
---|
426 | { |
---|
427 | public: |
---|
428 | typedef std::complex<RealScalar> Scalar; |
---|
429 | typedef RealScalar LhsScalar; |
---|
430 | typedef Scalar RhsScalar; |
---|
431 | typedef Scalar ResScalar; |
---|
432 | |
---|
433 | enum { |
---|
434 | ConjLhs = false, |
---|
435 | ConjRhs = _ConjRhs, |
---|
436 | Vectorizable = packet_traits<RealScalar>::Vectorizable |
---|
437 | && packet_traits<Scalar>::Vectorizable, |
---|
438 | LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, |
---|
439 | RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, |
---|
440 | ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, |
---|
441 | |
---|
442 | NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, |
---|
443 | nr = 4, |
---|
444 | mr = 2*ResPacketSize, |
---|
445 | WorkSpaceFactor = nr*RhsPacketSize, |
---|
446 | |
---|
447 | LhsProgress = ResPacketSize, |
---|
448 | RhsProgress = ResPacketSize |
---|
449 | }; |
---|
450 | |
---|
451 | typedef typename packet_traits<LhsScalar>::type _LhsPacket; |
---|
452 | typedef typename packet_traits<RhsScalar>::type _RhsPacket; |
---|
453 | typedef typename packet_traits<ResScalar>::type _ResPacket; |
---|
454 | |
---|
455 | typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; |
---|
456 | typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; |
---|
457 | typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; |
---|
458 | |
---|
459 | typedef ResPacket AccPacket; |
---|
460 | |
---|
461 | EIGEN_STRONG_INLINE void initAcc(AccPacket& p) |
---|
462 | { |
---|
463 | p = pset1<ResPacket>(ResScalar(0)); |
---|
464 | } |
---|
465 | |
---|
466 | EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) |
---|
467 | { |
---|
468 | for(DenseIndex k=0; k<n; k++) |
---|
469 | pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); |
---|
470 | } |
---|
471 | |
---|
472 | EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const |
---|
473 | { |
---|
474 | dest = pload<RhsPacket>(b); |
---|
475 | } |
---|
476 | |
---|
477 | EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const |
---|
478 | { |
---|
479 | dest = ploaddup<LhsPacket>(a); |
---|
480 | } |
---|
481 | |
---|
482 | EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const |
---|
483 | { |
---|
484 | madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); |
---|
485 | } |
---|
486 | |
---|
487 | EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const |
---|
488 | { |
---|
489 | tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); |
---|
490 | } |
---|
491 | |
---|
492 | EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const |
---|
493 | { |
---|
494 | c += a * b; |
---|
495 | } |
---|
496 | |
---|
497 | EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const |
---|
498 | { |
---|
499 | r = cj.pmadd(alpha,c,r); |
---|
500 | } |
---|
501 | |
---|
502 | protected: |
---|
503 | conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; |
---|
504 | }; |
---|
505 | |
---|
506 | /* optimized GEneral packed Block * packed Panel product kernel |
---|
507 | * |
---|
508 | * Mixing type logic: C += A * B |
---|
509 | * | A | B | comments |
---|
510 | * |real |cplx | no vectorization yet, would require to pack A with duplication |
---|
511 | * |cplx |real | easy vectorization |
---|
512 | */ |
---|
513 | template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> |
---|
514 | struct gebp_kernel |
---|
515 | { |
---|
516 | typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; |
---|
517 | typedef typename Traits::ResScalar ResScalar; |
---|
518 | typedef typename Traits::LhsPacket LhsPacket; |
---|
519 | typedef typename Traits::RhsPacket RhsPacket; |
---|
520 | typedef typename Traits::ResPacket ResPacket; |
---|
521 | typedef typename Traits::AccPacket AccPacket; |
---|
522 | |
---|
523 | enum { |
---|
524 | Vectorizable = Traits::Vectorizable, |
---|
525 | LhsProgress = Traits::LhsProgress, |
---|
526 | RhsProgress = Traits::RhsProgress, |
---|
527 | ResPacketSize = Traits::ResPacketSize |
---|
528 | }; |
---|
529 | |
---|
530 | EIGEN_DONT_INLINE EIGEN_FLATTEN_ATTRIB |
---|
531 | void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, |
---|
532 | Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) |
---|
533 | { |
---|
534 | Traits traits; |
---|
535 | |
---|
536 | if(strideA==-1) strideA = depth; |
---|
537 | if(strideB==-1) strideB = depth; |
---|
538 | conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; |
---|
539 | // conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; |
---|
540 | Index packet_cols = (cols/nr) * nr; |
---|
541 | const Index peeled_mc = (rows/mr)*mr; |
---|
542 | // FIXME: |
---|
543 | const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); |
---|
544 | const Index peeled_kc = (depth/4)*4; |
---|
545 | |
---|
546 | if(unpackedB==0) |
---|
547 | unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress); |
---|
548 | |
---|
549 | // loops on each micro vertical panel of rhs (depth x nr) |
---|
550 | for(Index j2=0; j2<packet_cols; j2+=nr) |
---|
551 | { |
---|
552 | traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); |
---|
553 | |
---|
554 | // loops on each largest micro horizontal panel of lhs (mr x depth) |
---|
555 | // => we select a mr x nr micro block of res which is entirely |
---|
556 | // stored into mr/packet_size x nr registers. |
---|
557 | for(Index i=0; i<peeled_mc; i+=mr) |
---|
558 | { |
---|
559 | const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; |
---|
560 | prefetch(&blA[0]); |
---|
561 | |
---|
562 | // gets res block as register |
---|
563 | AccPacket C0, C1, C2, C3, C4, C5, C6, C7; |
---|
564 | traits.initAcc(C0); |
---|
565 | traits.initAcc(C1); |
---|
566 | if(nr==4) traits.initAcc(C2); |
---|
567 | if(nr==4) traits.initAcc(C3); |
---|
568 | traits.initAcc(C4); |
---|
569 | traits.initAcc(C5); |
---|
570 | if(nr==4) traits.initAcc(C6); |
---|
571 | if(nr==4) traits.initAcc(C7); |
---|
572 | |
---|
573 | ResScalar* r0 = &res[(j2+0)*resStride + i]; |
---|
574 | ResScalar* r1 = r0 + resStride; |
---|
575 | ResScalar* r2 = r1 + resStride; |
---|
576 | ResScalar* r3 = r2 + resStride; |
---|
577 | |
---|
578 | prefetch(r0+16); |
---|
579 | prefetch(r1+16); |
---|
580 | prefetch(r2+16); |
---|
581 | prefetch(r3+16); |
---|
582 | |
---|
583 | // performs "inner" product |
---|
584 | // TODO let's check wether the folowing peeled loop could not be |
---|
585 | // optimized via optimal prefetching from one loop to the other |
---|
586 | const RhsScalar* blB = unpackedB; |
---|
587 | for(Index k=0; k<peeled_kc; k+=4) |
---|
588 | { |
---|
589 | if(nr==2) |
---|
590 | { |
---|
591 | LhsPacket A0, A1; |
---|
592 | RhsPacket B_0; |
---|
593 | RhsPacket T0; |
---|
594 | |
---|
595 | EIGEN_ASM_COMMENT("mybegin2"); |
---|
596 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
597 | traits.loadLhs(&blA[1*LhsProgress], A1); |
---|
598 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
599 | traits.madd(A0,B_0,C0,T0); |
---|
600 | traits.madd(A1,B_0,C4,B_0); |
---|
601 | traits.loadRhs(&blB[1*RhsProgress], B_0); |
---|
602 | traits.madd(A0,B_0,C1,T0); |
---|
603 | traits.madd(A1,B_0,C5,B_0); |
---|
604 | |
---|
605 | traits.loadLhs(&blA[2*LhsProgress], A0); |
---|
606 | traits.loadLhs(&blA[3*LhsProgress], A1); |
---|
607 | traits.loadRhs(&blB[2*RhsProgress], B_0); |
---|
608 | traits.madd(A0,B_0,C0,T0); |
---|
609 | traits.madd(A1,B_0,C4,B_0); |
---|
610 | traits.loadRhs(&blB[3*RhsProgress], B_0); |
---|
611 | traits.madd(A0,B_0,C1,T0); |
---|
612 | traits.madd(A1,B_0,C5,B_0); |
---|
613 | |
---|
614 | traits.loadLhs(&blA[4*LhsProgress], A0); |
---|
615 | traits.loadLhs(&blA[5*LhsProgress], A1); |
---|
616 | traits.loadRhs(&blB[4*RhsProgress], B_0); |
---|
617 | traits.madd(A0,B_0,C0,T0); |
---|
618 | traits.madd(A1,B_0,C4,B_0); |
---|
619 | traits.loadRhs(&blB[5*RhsProgress], B_0); |
---|
620 | traits.madd(A0,B_0,C1,T0); |
---|
621 | traits.madd(A1,B_0,C5,B_0); |
---|
622 | |
---|
623 | traits.loadLhs(&blA[6*LhsProgress], A0); |
---|
624 | traits.loadLhs(&blA[7*LhsProgress], A1); |
---|
625 | traits.loadRhs(&blB[6*RhsProgress], B_0); |
---|
626 | traits.madd(A0,B_0,C0,T0); |
---|
627 | traits.madd(A1,B_0,C4,B_0); |
---|
628 | traits.loadRhs(&blB[7*RhsProgress], B_0); |
---|
629 | traits.madd(A0,B_0,C1,T0); |
---|
630 | traits.madd(A1,B_0,C5,B_0); |
---|
631 | EIGEN_ASM_COMMENT("myend"); |
---|
632 | } |
---|
633 | else |
---|
634 | { |
---|
635 | EIGEN_ASM_COMMENT("mybegin4"); |
---|
636 | LhsPacket A0, A1; |
---|
637 | RhsPacket B_0, B1, B2, B3; |
---|
638 | RhsPacket T0; |
---|
639 | |
---|
640 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
641 | traits.loadLhs(&blA[1*LhsProgress], A1); |
---|
642 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
643 | traits.loadRhs(&blB[1*RhsProgress], B1); |
---|
644 | |
---|
645 | traits.madd(A0,B_0,C0,T0); |
---|
646 | traits.loadRhs(&blB[2*RhsProgress], B2); |
---|
647 | traits.madd(A1,B_0,C4,B_0); |
---|
648 | traits.loadRhs(&blB[3*RhsProgress], B3); |
---|
649 | traits.loadRhs(&blB[4*RhsProgress], B_0); |
---|
650 | traits.madd(A0,B1,C1,T0); |
---|
651 | traits.madd(A1,B1,C5,B1); |
---|
652 | traits.loadRhs(&blB[5*RhsProgress], B1); |
---|
653 | traits.madd(A0,B2,C2,T0); |
---|
654 | traits.madd(A1,B2,C6,B2); |
---|
655 | traits.loadRhs(&blB[6*RhsProgress], B2); |
---|
656 | traits.madd(A0,B3,C3,T0); |
---|
657 | traits.loadLhs(&blA[2*LhsProgress], A0); |
---|
658 | traits.madd(A1,B3,C7,B3); |
---|
659 | traits.loadLhs(&blA[3*LhsProgress], A1); |
---|
660 | traits.loadRhs(&blB[7*RhsProgress], B3); |
---|
661 | traits.madd(A0,B_0,C0,T0); |
---|
662 | traits.madd(A1,B_0,C4,B_0); |
---|
663 | traits.loadRhs(&blB[8*RhsProgress], B_0); |
---|
664 | traits.madd(A0,B1,C1,T0); |
---|
665 | traits.madd(A1,B1,C5,B1); |
---|
666 | traits.loadRhs(&blB[9*RhsProgress], B1); |
---|
667 | traits.madd(A0,B2,C2,T0); |
---|
668 | traits.madd(A1,B2,C6,B2); |
---|
669 | traits.loadRhs(&blB[10*RhsProgress], B2); |
---|
670 | traits.madd(A0,B3,C3,T0); |
---|
671 | traits.loadLhs(&blA[4*LhsProgress], A0); |
---|
672 | traits.madd(A1,B3,C7,B3); |
---|
673 | traits.loadLhs(&blA[5*LhsProgress], A1); |
---|
674 | traits.loadRhs(&blB[11*RhsProgress], B3); |
---|
675 | |
---|
676 | traits.madd(A0,B_0,C0,T0); |
---|
677 | traits.madd(A1,B_0,C4,B_0); |
---|
678 | traits.loadRhs(&blB[12*RhsProgress], B_0); |
---|
679 | traits.madd(A0,B1,C1,T0); |
---|
680 | traits.madd(A1,B1,C5,B1); |
---|
681 | traits.loadRhs(&blB[13*RhsProgress], B1); |
---|
682 | traits.madd(A0,B2,C2,T0); |
---|
683 | traits.madd(A1,B2,C6,B2); |
---|
684 | traits.loadRhs(&blB[14*RhsProgress], B2); |
---|
685 | traits.madd(A0,B3,C3,T0); |
---|
686 | traits.loadLhs(&blA[6*LhsProgress], A0); |
---|
687 | traits.madd(A1,B3,C7,B3); |
---|
688 | traits.loadLhs(&blA[7*LhsProgress], A1); |
---|
689 | traits.loadRhs(&blB[15*RhsProgress], B3); |
---|
690 | traits.madd(A0,B_0,C0,T0); |
---|
691 | traits.madd(A1,B_0,C4,B_0); |
---|
692 | traits.madd(A0,B1,C1,T0); |
---|
693 | traits.madd(A1,B1,C5,B1); |
---|
694 | traits.madd(A0,B2,C2,T0); |
---|
695 | traits.madd(A1,B2,C6,B2); |
---|
696 | traits.madd(A0,B3,C3,T0); |
---|
697 | traits.madd(A1,B3,C7,B3); |
---|
698 | } |
---|
699 | |
---|
700 | blB += 4*nr*RhsProgress; |
---|
701 | blA += 4*mr; |
---|
702 | } |
---|
703 | // process remaining peeled loop |
---|
704 | for(Index k=peeled_kc; k<depth; k++) |
---|
705 | { |
---|
706 | if(nr==2) |
---|
707 | { |
---|
708 | LhsPacket A0, A1; |
---|
709 | RhsPacket B_0; |
---|
710 | RhsPacket T0; |
---|
711 | |
---|
712 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
713 | traits.loadLhs(&blA[1*LhsProgress], A1); |
---|
714 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
715 | traits.madd(A0,B_0,C0,T0); |
---|
716 | traits.madd(A1,B_0,C4,B_0); |
---|
717 | traits.loadRhs(&blB[1*RhsProgress], B_0); |
---|
718 | traits.madd(A0,B_0,C1,T0); |
---|
719 | traits.madd(A1,B_0,C5,B_0); |
---|
720 | } |
---|
721 | else |
---|
722 | { |
---|
723 | LhsPacket A0, A1; |
---|
724 | RhsPacket B_0, B1, B2, B3; |
---|
725 | RhsPacket T0; |
---|
726 | |
---|
727 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
728 | traits.loadLhs(&blA[1*LhsProgress], A1); |
---|
729 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
730 | traits.loadRhs(&blB[1*RhsProgress], B1); |
---|
731 | |
---|
732 | traits.madd(A0,B_0,C0,T0); |
---|
733 | traits.loadRhs(&blB[2*RhsProgress], B2); |
---|
734 | traits.madd(A1,B_0,C4,B_0); |
---|
735 | traits.loadRhs(&blB[3*RhsProgress], B3); |
---|
736 | traits.madd(A0,B1,C1,T0); |
---|
737 | traits.madd(A1,B1,C5,B1); |
---|
738 | traits.madd(A0,B2,C2,T0); |
---|
739 | traits.madd(A1,B2,C6,B2); |
---|
740 | traits.madd(A0,B3,C3,T0); |
---|
741 | traits.madd(A1,B3,C7,B3); |
---|
742 | } |
---|
743 | |
---|
744 | blB += nr*RhsProgress; |
---|
745 | blA += mr; |
---|
746 | } |
---|
747 | |
---|
748 | if(nr==4) |
---|
749 | { |
---|
750 | ResPacket R0, R1, R2, R3, R4, R5, R6; |
---|
751 | ResPacket alphav = pset1<ResPacket>(alpha); |
---|
752 | |
---|
753 | R0 = ploadu<ResPacket>(r0); |
---|
754 | R1 = ploadu<ResPacket>(r1); |
---|
755 | R2 = ploadu<ResPacket>(r2); |
---|
756 | R3 = ploadu<ResPacket>(r3); |
---|
757 | R4 = ploadu<ResPacket>(r0 + ResPacketSize); |
---|
758 | R5 = ploadu<ResPacket>(r1 + ResPacketSize); |
---|
759 | R6 = ploadu<ResPacket>(r2 + ResPacketSize); |
---|
760 | traits.acc(C0, alphav, R0); |
---|
761 | pstoreu(r0, R0); |
---|
762 | R0 = ploadu<ResPacket>(r3 + ResPacketSize); |
---|
763 | |
---|
764 | traits.acc(C1, alphav, R1); |
---|
765 | traits.acc(C2, alphav, R2); |
---|
766 | traits.acc(C3, alphav, R3); |
---|
767 | traits.acc(C4, alphav, R4); |
---|
768 | traits.acc(C5, alphav, R5); |
---|
769 | traits.acc(C6, alphav, R6); |
---|
770 | traits.acc(C7, alphav, R0); |
---|
771 | |
---|
772 | pstoreu(r1, R1); |
---|
773 | pstoreu(r2, R2); |
---|
774 | pstoreu(r3, R3); |
---|
775 | pstoreu(r0 + ResPacketSize, R4); |
---|
776 | pstoreu(r1 + ResPacketSize, R5); |
---|
777 | pstoreu(r2 + ResPacketSize, R6); |
---|
778 | pstoreu(r3 + ResPacketSize, R0); |
---|
779 | } |
---|
780 | else |
---|
781 | { |
---|
782 | ResPacket R0, R1, R4; |
---|
783 | ResPacket alphav = pset1<ResPacket>(alpha); |
---|
784 | |
---|
785 | R0 = ploadu<ResPacket>(r0); |
---|
786 | R1 = ploadu<ResPacket>(r1); |
---|
787 | R4 = ploadu<ResPacket>(r0 + ResPacketSize); |
---|
788 | traits.acc(C0, alphav, R0); |
---|
789 | pstoreu(r0, R0); |
---|
790 | R0 = ploadu<ResPacket>(r1 + ResPacketSize); |
---|
791 | traits.acc(C1, alphav, R1); |
---|
792 | traits.acc(C4, alphav, R4); |
---|
793 | traits.acc(C5, alphav, R0); |
---|
794 | pstoreu(r1, R1); |
---|
795 | pstoreu(r0 + ResPacketSize, R4); |
---|
796 | pstoreu(r1 + ResPacketSize, R0); |
---|
797 | } |
---|
798 | |
---|
799 | } |
---|
800 | |
---|
801 | if(rows-peeled_mc>=LhsProgress) |
---|
802 | { |
---|
803 | Index i = peeled_mc; |
---|
804 | const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; |
---|
805 | prefetch(&blA[0]); |
---|
806 | |
---|
807 | // gets res block as register |
---|
808 | AccPacket C0, C1, C2, C3; |
---|
809 | traits.initAcc(C0); |
---|
810 | traits.initAcc(C1); |
---|
811 | if(nr==4) traits.initAcc(C2); |
---|
812 | if(nr==4) traits.initAcc(C3); |
---|
813 | |
---|
814 | // performs "inner" product |
---|
815 | const RhsScalar* blB = unpackedB; |
---|
816 | for(Index k=0; k<peeled_kc; k+=4) |
---|
817 | { |
---|
818 | if(nr==2) |
---|
819 | { |
---|
820 | LhsPacket A0; |
---|
821 | RhsPacket B_0, B1; |
---|
822 | |
---|
823 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
824 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
825 | traits.loadRhs(&blB[1*RhsProgress], B1); |
---|
826 | traits.madd(A0,B_0,C0,B_0); |
---|
827 | traits.loadRhs(&blB[2*RhsProgress], B_0); |
---|
828 | traits.madd(A0,B1,C1,B1); |
---|
829 | traits.loadLhs(&blA[1*LhsProgress], A0); |
---|
830 | traits.loadRhs(&blB[3*RhsProgress], B1); |
---|
831 | traits.madd(A0,B_0,C0,B_0); |
---|
832 | traits.loadRhs(&blB[4*RhsProgress], B_0); |
---|
833 | traits.madd(A0,B1,C1,B1); |
---|
834 | traits.loadLhs(&blA[2*LhsProgress], A0); |
---|
835 | traits.loadRhs(&blB[5*RhsProgress], B1); |
---|
836 | traits.madd(A0,B_0,C0,B_0); |
---|
837 | traits.loadRhs(&blB[6*RhsProgress], B_0); |
---|
838 | traits.madd(A0,B1,C1,B1); |
---|
839 | traits.loadLhs(&blA[3*LhsProgress], A0); |
---|
840 | traits.loadRhs(&blB[7*RhsProgress], B1); |
---|
841 | traits.madd(A0,B_0,C0,B_0); |
---|
842 | traits.madd(A0,B1,C1,B1); |
---|
843 | } |
---|
844 | else |
---|
845 | { |
---|
846 | LhsPacket A0; |
---|
847 | RhsPacket B_0, B1, B2, B3; |
---|
848 | |
---|
849 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
850 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
851 | traits.loadRhs(&blB[1*RhsProgress], B1); |
---|
852 | |
---|
853 | traits.madd(A0,B_0,C0,B_0); |
---|
854 | traits.loadRhs(&blB[2*RhsProgress], B2); |
---|
855 | traits.loadRhs(&blB[3*RhsProgress], B3); |
---|
856 | traits.loadRhs(&blB[4*RhsProgress], B_0); |
---|
857 | traits.madd(A0,B1,C1,B1); |
---|
858 | traits.loadRhs(&blB[5*RhsProgress], B1); |
---|
859 | traits.madd(A0,B2,C2,B2); |
---|
860 | traits.loadRhs(&blB[6*RhsProgress], B2); |
---|
861 | traits.madd(A0,B3,C3,B3); |
---|
862 | traits.loadLhs(&blA[1*LhsProgress], A0); |
---|
863 | traits.loadRhs(&blB[7*RhsProgress], B3); |
---|
864 | traits.madd(A0,B_0,C0,B_0); |
---|
865 | traits.loadRhs(&blB[8*RhsProgress], B_0); |
---|
866 | traits.madd(A0,B1,C1,B1); |
---|
867 | traits.loadRhs(&blB[9*RhsProgress], B1); |
---|
868 | traits.madd(A0,B2,C2,B2); |
---|
869 | traits.loadRhs(&blB[10*RhsProgress], B2); |
---|
870 | traits.madd(A0,B3,C3,B3); |
---|
871 | traits.loadLhs(&blA[2*LhsProgress], A0); |
---|
872 | traits.loadRhs(&blB[11*RhsProgress], B3); |
---|
873 | |
---|
874 | traits.madd(A0,B_0,C0,B_0); |
---|
875 | traits.loadRhs(&blB[12*RhsProgress], B_0); |
---|
876 | traits.madd(A0,B1,C1,B1); |
---|
877 | traits.loadRhs(&blB[13*RhsProgress], B1); |
---|
878 | traits.madd(A0,B2,C2,B2); |
---|
879 | traits.loadRhs(&blB[14*RhsProgress], B2); |
---|
880 | traits.madd(A0,B3,C3,B3); |
---|
881 | |
---|
882 | traits.loadLhs(&blA[3*LhsProgress], A0); |
---|
883 | traits.loadRhs(&blB[15*RhsProgress], B3); |
---|
884 | traits.madd(A0,B_0,C0,B_0); |
---|
885 | traits.madd(A0,B1,C1,B1); |
---|
886 | traits.madd(A0,B2,C2,B2); |
---|
887 | traits.madd(A0,B3,C3,B3); |
---|
888 | } |
---|
889 | |
---|
890 | blB += nr*4*RhsProgress; |
---|
891 | blA += 4*LhsProgress; |
---|
892 | } |
---|
893 | // process remaining peeled loop |
---|
894 | for(Index k=peeled_kc; k<depth; k++) |
---|
895 | { |
---|
896 | if(nr==2) |
---|
897 | { |
---|
898 | LhsPacket A0; |
---|
899 | RhsPacket B_0, B1; |
---|
900 | |
---|
901 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
902 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
903 | traits.loadRhs(&blB[1*RhsProgress], B1); |
---|
904 | traits.madd(A0,B_0,C0,B_0); |
---|
905 | traits.madd(A0,B1,C1,B1); |
---|
906 | } |
---|
907 | else |
---|
908 | { |
---|
909 | LhsPacket A0; |
---|
910 | RhsPacket B_0, B1, B2, B3; |
---|
911 | |
---|
912 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
913 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
914 | traits.loadRhs(&blB[1*RhsProgress], B1); |
---|
915 | traits.loadRhs(&blB[2*RhsProgress], B2); |
---|
916 | traits.loadRhs(&blB[3*RhsProgress], B3); |
---|
917 | |
---|
918 | traits.madd(A0,B_0,C0,B_0); |
---|
919 | traits.madd(A0,B1,C1,B1); |
---|
920 | traits.madd(A0,B2,C2,B2); |
---|
921 | traits.madd(A0,B3,C3,B3); |
---|
922 | } |
---|
923 | |
---|
924 | blB += nr*RhsProgress; |
---|
925 | blA += LhsProgress; |
---|
926 | } |
---|
927 | |
---|
928 | ResPacket R0, R1, R2, R3; |
---|
929 | ResPacket alphav = pset1<ResPacket>(alpha); |
---|
930 | |
---|
931 | ResScalar* r0 = &res[(j2+0)*resStride + i]; |
---|
932 | ResScalar* r1 = r0 + resStride; |
---|
933 | ResScalar* r2 = r1 + resStride; |
---|
934 | ResScalar* r3 = r2 + resStride; |
---|
935 | |
---|
936 | R0 = ploadu<ResPacket>(r0); |
---|
937 | R1 = ploadu<ResPacket>(r1); |
---|
938 | if(nr==4) R2 = ploadu<ResPacket>(r2); |
---|
939 | if(nr==4) R3 = ploadu<ResPacket>(r3); |
---|
940 | |
---|
941 | traits.acc(C0, alphav, R0); |
---|
942 | traits.acc(C1, alphav, R1); |
---|
943 | if(nr==4) traits.acc(C2, alphav, R2); |
---|
944 | if(nr==4) traits.acc(C3, alphav, R3); |
---|
945 | |
---|
946 | pstoreu(r0, R0); |
---|
947 | pstoreu(r1, R1); |
---|
948 | if(nr==4) pstoreu(r2, R2); |
---|
949 | if(nr==4) pstoreu(r3, R3); |
---|
950 | } |
---|
951 | for(Index i=peeled_mc2; i<rows; i++) |
---|
952 | { |
---|
953 | const LhsScalar* blA = &blockA[i*strideA+offsetA]; |
---|
954 | prefetch(&blA[0]); |
---|
955 | |
---|
956 | // gets a 1 x nr res block as registers |
---|
957 | ResScalar C0(0), C1(0), C2(0), C3(0); |
---|
958 | // TODO directly use blockB ??? |
---|
959 | const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; |
---|
960 | for(Index k=0; k<depth; k++) |
---|
961 | { |
---|
962 | if(nr==2) |
---|
963 | { |
---|
964 | LhsScalar A0; |
---|
965 | RhsScalar B_0, B1; |
---|
966 | |
---|
967 | A0 = blA[k]; |
---|
968 | B_0 = blB[0]; |
---|
969 | B1 = blB[1]; |
---|
970 | MADD(cj,A0,B_0,C0,B_0); |
---|
971 | MADD(cj,A0,B1,C1,B1); |
---|
972 | } |
---|
973 | else |
---|
974 | { |
---|
975 | LhsScalar A0; |
---|
976 | RhsScalar B_0, B1, B2, B3; |
---|
977 | |
---|
978 | A0 = blA[k]; |
---|
979 | B_0 = blB[0]; |
---|
980 | B1 = blB[1]; |
---|
981 | B2 = blB[2]; |
---|
982 | B3 = blB[3]; |
---|
983 | |
---|
984 | MADD(cj,A0,B_0,C0,B_0); |
---|
985 | MADD(cj,A0,B1,C1,B1); |
---|
986 | MADD(cj,A0,B2,C2,B2); |
---|
987 | MADD(cj,A0,B3,C3,B3); |
---|
988 | } |
---|
989 | |
---|
990 | blB += nr; |
---|
991 | } |
---|
992 | res[(j2+0)*resStride + i] += alpha*C0; |
---|
993 | res[(j2+1)*resStride + i] += alpha*C1; |
---|
994 | if(nr==4) res[(j2+2)*resStride + i] += alpha*C2; |
---|
995 | if(nr==4) res[(j2+3)*resStride + i] += alpha*C3; |
---|
996 | } |
---|
997 | } |
---|
998 | // process remaining rhs/res columns one at a time |
---|
999 | // => do the same but with nr==1 |
---|
1000 | for(Index j2=packet_cols; j2<cols; j2++) |
---|
1001 | { |
---|
1002 | // unpack B |
---|
1003 | traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); |
---|
1004 | |
---|
1005 | for(Index i=0; i<peeled_mc; i+=mr) |
---|
1006 | { |
---|
1007 | const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; |
---|
1008 | prefetch(&blA[0]); |
---|
1009 | |
---|
1010 | // TODO move the res loads to the stores |
---|
1011 | |
---|
1012 | // get res block as registers |
---|
1013 | AccPacket C0, C4; |
---|
1014 | traits.initAcc(C0); |
---|
1015 | traits.initAcc(C4); |
---|
1016 | |
---|
1017 | const RhsScalar* blB = unpackedB; |
---|
1018 | for(Index k=0; k<depth; k++) |
---|
1019 | { |
---|
1020 | LhsPacket A0, A1; |
---|
1021 | RhsPacket B_0; |
---|
1022 | RhsPacket T0; |
---|
1023 | |
---|
1024 | traits.loadLhs(&blA[0*LhsProgress], A0); |
---|
1025 | traits.loadLhs(&blA[1*LhsProgress], A1); |
---|
1026 | traits.loadRhs(&blB[0*RhsProgress], B_0); |
---|
1027 | traits.madd(A0,B_0,C0,T0); |
---|
1028 | traits.madd(A1,B_0,C4,B_0); |
---|
1029 | |
---|
1030 | blB += RhsProgress; |
---|
1031 | blA += 2*LhsProgress; |
---|
1032 | } |
---|
1033 | ResPacket R0, R4; |
---|
1034 | ResPacket alphav = pset1<ResPacket>(alpha); |
---|
1035 | |
---|
1036 | ResScalar* r0 = &res[(j2+0)*resStride + i]; |
---|
1037 | |
---|
1038 | R0 = ploadu<ResPacket>(r0); |
---|
1039 | R4 = ploadu<ResPacket>(r0+ResPacketSize); |
---|
1040 | |
---|
1041 | traits.acc(C0, alphav, R0); |
---|
1042 | traits.acc(C4, alphav, R4); |
---|
1043 | |
---|
1044 | pstoreu(r0, R0); |
---|
1045 | pstoreu(r0+ResPacketSize, R4); |
---|
1046 | } |
---|
1047 | if(rows-peeled_mc>=LhsProgress) |
---|
1048 | { |
---|
1049 | Index i = peeled_mc; |
---|
1050 | const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; |
---|
1051 | prefetch(&blA[0]); |
---|
1052 | |
---|
1053 | AccPacket C0; |
---|
1054 | traits.initAcc(C0); |
---|
1055 | |
---|
1056 | const RhsScalar* blB = unpackedB; |
---|
1057 | for(Index k=0; k<depth; k++) |
---|
1058 | { |
---|
1059 | LhsPacket A0; |
---|
1060 | RhsPacket B_0; |
---|
1061 | traits.loadLhs(blA, A0); |
---|
1062 | traits.loadRhs(blB, B_0); |
---|
1063 | traits.madd(A0, B_0, C0, B_0); |
---|
1064 | blB += RhsProgress; |
---|
1065 | blA += LhsProgress; |
---|
1066 | } |
---|
1067 | |
---|
1068 | ResPacket alphav = pset1<ResPacket>(alpha); |
---|
1069 | ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]); |
---|
1070 | traits.acc(C0, alphav, R0); |
---|
1071 | pstoreu(&res[(j2+0)*resStride + i], R0); |
---|
1072 | } |
---|
1073 | for(Index i=peeled_mc2; i<rows; i++) |
---|
1074 | { |
---|
1075 | const LhsScalar* blA = &blockA[i*strideA+offsetA]; |
---|
1076 | prefetch(&blA[0]); |
---|
1077 | |
---|
1078 | // gets a 1 x 1 res block as registers |
---|
1079 | ResScalar C0(0); |
---|
1080 | // FIXME directly use blockB ?? |
---|
1081 | const RhsScalar* blB = &blockB[j2*strideB+offsetB]; |
---|
1082 | for(Index k=0; k<depth; k++) |
---|
1083 | { |
---|
1084 | LhsScalar A0 = blA[k]; |
---|
1085 | RhsScalar B_0 = blB[k]; |
---|
1086 | MADD(cj, A0, B_0, C0, B_0); |
---|
1087 | } |
---|
1088 | res[(j2+0)*resStride + i] += alpha*C0; |
---|
1089 | } |
---|
1090 | } |
---|
1091 | } |
---|
1092 | }; |
---|
1093 | |
---|
1094 | #undef CJMADD |
---|
1095 | |
---|
1096 | // pack a block of the lhs |
---|
1097 | // The traversal is as follow (mr==4): |
---|
1098 | // 0 4 8 12 ... |
---|
1099 | // 1 5 9 13 ... |
---|
1100 | // 2 6 10 14 ... |
---|
1101 | // 3 7 11 15 ... |
---|
1102 | // |
---|
1103 | // 16 20 24 28 ... |
---|
1104 | // 17 21 25 29 ... |
---|
1105 | // 18 22 26 30 ... |
---|
1106 | // 19 23 27 31 ... |
---|
1107 | // |
---|
1108 | // 32 33 34 35 ... |
---|
1109 | // 36 36 38 39 ... |
---|
1110 | template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> |
---|
1111 | struct gemm_pack_lhs |
---|
1112 | { |
---|
1113 | EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, |
---|
1114 | Index stride=0, Index offset=0) |
---|
1115 | { |
---|
1116 | typedef typename packet_traits<Scalar>::type Packet; |
---|
1117 | enum { PacketSize = packet_traits<Scalar>::size }; |
---|
1118 | |
---|
1119 | EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); |
---|
1120 | eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); |
---|
1121 | eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); |
---|
1122 | conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; |
---|
1123 | const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); |
---|
1124 | Index count = 0; |
---|
1125 | Index peeled_mc = (rows/Pack1)*Pack1; |
---|
1126 | for(Index i=0; i<peeled_mc; i+=Pack1) |
---|
1127 | { |
---|
1128 | if(PanelMode) count += Pack1 * offset; |
---|
1129 | |
---|
1130 | if(StorageOrder==ColMajor) |
---|
1131 | { |
---|
1132 | for(Index k=0; k<depth; k++) |
---|
1133 | { |
---|
1134 | Packet A, B, C, D; |
---|
1135 | if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); |
---|
1136 | if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); |
---|
1137 | if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); |
---|
1138 | if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k)); |
---|
1139 | if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } |
---|
1140 | if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } |
---|
1141 | if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } |
---|
1142 | if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } |
---|
1143 | } |
---|
1144 | } |
---|
1145 | else |
---|
1146 | { |
---|
1147 | for(Index k=0; k<depth; k++) |
---|
1148 | { |
---|
1149 | // TODO add a vectorized transpose here |
---|
1150 | Index w=0; |
---|
1151 | for(; w<Pack1-3; w+=4) |
---|
1152 | { |
---|
1153 | Scalar a(cj(lhs(i+w+0, k))), |
---|
1154 | b(cj(lhs(i+w+1, k))), |
---|
1155 | c(cj(lhs(i+w+2, k))), |
---|
1156 | d(cj(lhs(i+w+3, k))); |
---|
1157 | blockA[count++] = a; |
---|
1158 | blockA[count++] = b; |
---|
1159 | blockA[count++] = c; |
---|
1160 | blockA[count++] = d; |
---|
1161 | } |
---|
1162 | if(Pack1%4) |
---|
1163 | for(;w<Pack1;++w) |
---|
1164 | blockA[count++] = cj(lhs(i+w, k)); |
---|
1165 | } |
---|
1166 | } |
---|
1167 | if(PanelMode) count += Pack1 * (stride-offset-depth); |
---|
1168 | } |
---|
1169 | if(rows-peeled_mc>=Pack2) |
---|
1170 | { |
---|
1171 | if(PanelMode) count += Pack2*offset; |
---|
1172 | for(Index k=0; k<depth; k++) |
---|
1173 | for(Index w=0; w<Pack2; w++) |
---|
1174 | blockA[count++] = cj(lhs(peeled_mc+w, k)); |
---|
1175 | if(PanelMode) count += Pack2 * (stride-offset-depth); |
---|
1176 | peeled_mc += Pack2; |
---|
1177 | } |
---|
1178 | for(Index i=peeled_mc; i<rows; i++) |
---|
1179 | { |
---|
1180 | if(PanelMode) count += offset; |
---|
1181 | for(Index k=0; k<depth; k++) |
---|
1182 | blockA[count++] = cj(lhs(i, k)); |
---|
1183 | if(PanelMode) count += (stride-offset-depth); |
---|
1184 | } |
---|
1185 | } |
---|
1186 | }; |
---|
1187 | |
---|
1188 | // copy a complete panel of the rhs |
---|
1189 | // this version is optimized for column major matrices |
---|
1190 | // The traversal order is as follow: (nr==4): |
---|
1191 | // 0 1 2 3 12 13 14 15 24 27 |
---|
1192 | // 4 5 6 7 16 17 18 19 25 28 |
---|
1193 | // 8 9 10 11 20 21 22 23 26 29 |
---|
1194 | // . . . . . . . . . . |
---|
1195 | template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> |
---|
1196 | struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> |
---|
1197 | { |
---|
1198 | typedef typename packet_traits<Scalar>::type Packet; |
---|
1199 | enum { PacketSize = packet_traits<Scalar>::size }; |
---|
1200 | EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, |
---|
1201 | Index stride=0, Index offset=0) |
---|
1202 | { |
---|
1203 | EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); |
---|
1204 | eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); |
---|
1205 | conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; |
---|
1206 | Index packet_cols = (cols/nr) * nr; |
---|
1207 | Index count = 0; |
---|
1208 | for(Index j2=0; j2<packet_cols; j2+=nr) |
---|
1209 | { |
---|
1210 | // skip what we have before |
---|
1211 | if(PanelMode) count += nr * offset; |
---|
1212 | const Scalar* b0 = &rhs[(j2+0)*rhsStride]; |
---|
1213 | const Scalar* b1 = &rhs[(j2+1)*rhsStride]; |
---|
1214 | const Scalar* b2 = &rhs[(j2+2)*rhsStride]; |
---|
1215 | const Scalar* b3 = &rhs[(j2+3)*rhsStride]; |
---|
1216 | for(Index k=0; k<depth; k++) |
---|
1217 | { |
---|
1218 | blockB[count+0] = cj(b0[k]); |
---|
1219 | blockB[count+1] = cj(b1[k]); |
---|
1220 | if(nr==4) blockB[count+2] = cj(b2[k]); |
---|
1221 | if(nr==4) blockB[count+3] = cj(b3[k]); |
---|
1222 | count += nr; |
---|
1223 | } |
---|
1224 | // skip what we have after |
---|
1225 | if(PanelMode) count += nr * (stride-offset-depth); |
---|
1226 | } |
---|
1227 | |
---|
1228 | // copy the remaining columns one at a time (nr==1) |
---|
1229 | for(Index j2=packet_cols; j2<cols; ++j2) |
---|
1230 | { |
---|
1231 | if(PanelMode) count += offset; |
---|
1232 | const Scalar* b0 = &rhs[(j2+0)*rhsStride]; |
---|
1233 | for(Index k=0; k<depth; k++) |
---|
1234 | { |
---|
1235 | blockB[count] = cj(b0[k]); |
---|
1236 | count += 1; |
---|
1237 | } |
---|
1238 | if(PanelMode) count += (stride-offset-depth); |
---|
1239 | } |
---|
1240 | } |
---|
1241 | }; |
---|
1242 | |
---|
1243 | // this version is optimized for row major matrices |
---|
1244 | template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> |
---|
1245 | struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> |
---|
1246 | { |
---|
1247 | enum { PacketSize = packet_traits<Scalar>::size }; |
---|
1248 | EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, |
---|
1249 | Index stride=0, Index offset=0) |
---|
1250 | { |
---|
1251 | EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); |
---|
1252 | eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); |
---|
1253 | conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; |
---|
1254 | Index packet_cols = (cols/nr) * nr; |
---|
1255 | Index count = 0; |
---|
1256 | for(Index j2=0; j2<packet_cols; j2+=nr) |
---|
1257 | { |
---|
1258 | // skip what we have before |
---|
1259 | if(PanelMode) count += nr * offset; |
---|
1260 | for(Index k=0; k<depth; k++) |
---|
1261 | { |
---|
1262 | const Scalar* b0 = &rhs[k*rhsStride + j2]; |
---|
1263 | blockB[count+0] = cj(b0[0]); |
---|
1264 | blockB[count+1] = cj(b0[1]); |
---|
1265 | if(nr==4) blockB[count+2] = cj(b0[2]); |
---|
1266 | if(nr==4) blockB[count+3] = cj(b0[3]); |
---|
1267 | count += nr; |
---|
1268 | } |
---|
1269 | // skip what we have after |
---|
1270 | if(PanelMode) count += nr * (stride-offset-depth); |
---|
1271 | } |
---|
1272 | // copy the remaining columns one at a time (nr==1) |
---|
1273 | for(Index j2=packet_cols; j2<cols; ++j2) |
---|
1274 | { |
---|
1275 | if(PanelMode) count += offset; |
---|
1276 | const Scalar* b0 = &rhs[j2]; |
---|
1277 | for(Index k=0; k<depth; k++) |
---|
1278 | { |
---|
1279 | blockB[count] = cj(b0[k*rhsStride]); |
---|
1280 | count += 1; |
---|
1281 | } |
---|
1282 | if(PanelMode) count += stride-offset-depth; |
---|
1283 | } |
---|
1284 | } |
---|
1285 | }; |
---|
1286 | |
---|
1287 | } // end namespace internal |
---|
1288 | |
---|
1289 | /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. |
---|
1290 | * \sa setCpuCacheSize */ |
---|
1291 | inline std::ptrdiff_t l1CacheSize() |
---|
1292 | { |
---|
1293 | std::ptrdiff_t l1, l2; |
---|
1294 | internal::manage_caching_sizes(GetAction, &l1, &l2); |
---|
1295 | return l1; |
---|
1296 | } |
---|
1297 | |
---|
1298 | /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. |
---|
1299 | * \sa setCpuCacheSize */ |
---|
1300 | inline std::ptrdiff_t l2CacheSize() |
---|
1301 | { |
---|
1302 | std::ptrdiff_t l1, l2; |
---|
1303 | internal::manage_caching_sizes(GetAction, &l1, &l2); |
---|
1304 | return l2; |
---|
1305 | } |
---|
1306 | |
---|
1307 | /** Set the cpu L1 and L2 cache sizes (in bytes). |
---|
1308 | * These values are use to adjust the size of the blocks |
---|
1309 | * for the algorithms working per blocks. |
---|
1310 | * |
---|
1311 | * \sa computeProductBlockingSizes */ |
---|
1312 | inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) |
---|
1313 | { |
---|
1314 | internal::manage_caching_sizes(SetAction, &l1, &l2); |
---|
1315 | } |
---|
1316 | |
---|
1317 | } // end namespace Eigen |
---|
1318 | |
---|
1319 | #endif // EIGEN_GENERAL_BLOCK_PANEL_H |
---|