// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces the number
 * of loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
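// A minimal usage sketch (illustrative only, not part of the library): Eigen's
// expression templates normally dispatch y += alpha * A * x with a col-major A
// to this kernel; a hypothetical direct call, relying on the primary template's
// default Version argument, would look roughly like:
//
//   typedef general_matrix_vector_product<DenseIndex, float, ColMajor,
//                                         false, float, false> Gemv;
//   Gemv::run(A.rows(), A.cols(), A.data(), A.outerStride(),
//             x.data(), 1, y.data(), 1, alpha);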
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index
    #ifdef EIGEN_INTERNAL_DEBUGGING
      resIncr
    #endif
    , RhsScalar alpha)
  {
    eigen_internal_assert(resIncr==1);
    #ifdef _EIGEN_ACCUMULATE_PACKETS
    #error _EIGEN_ACCUMULATE_PACKETS has already been defined
    #endif
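    // Accumulate the contribution of four lhs columns into one aligned packet
    // of the result. The A0/A13/A2 arguments are token-pasted onto "ploa" to
    // select an aligned load ("d" -> pload) or an unaligned load ("du" ->
    // ploadu) for lhs0/lhs2 (A0, A2) and for lhs1/lhs3 (A13).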
    #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
      pstore(&res[j], \
        padd(pload<ResPacket>(&res[j]), \
          padd( \
            padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
                 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
            padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
                 pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
    conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
    if(ConjugateRhs)
      alpha = conj(alpha);

    enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
    const Index columnsAtOnce = 4;
    const Index peels = 2;
    const Index LhsPacketAlignedMask = LhsPacketSize-1;
    const Index ResPacketAlignedMask = ResPacketSize-1;
    const Index size = rows;

    // How many coeffs of the result do we have to skip to be aligned.
    // Here we assume data are at least aligned on the base scalar type.
    Index alignedStart = internal::first_aligned(res,size);
    Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
    const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
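    // peeledSize leaves enough headroom for the peeled loop below, whose
    // shifted loads read up to three packets beyond the loop index j.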

    const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
    Index alignmentPattern = alignmentStep==0 ? AllAligned
                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                           : FirstAligned;
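    // e.g. for float on SSE (LhsPacketSize==4): lhsStride%4==0 gives AllAligned,
    // lhsStride%4==2 gives alignmentStep==2==LhsPacketSize/2, i.e. EvenAligned
    // (every other column shares the result's alignment), while lhsStride%4==1
    // or 3 gives FirstAligned (only one column in four does).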

    // we cannot assume the first element is aligned because of sub-matrices
    const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

    // find how many columns we have to skip to be aligned with the result (if possible)
    Index skipColumns = 0;
    // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
    if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
    {
      alignedSize = 0;
      alignedStart = 0;
    }
    else if (LhsPacketSize>1)
    {
      eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

      while (skipColumns<LhsPacketSize &&
             alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
        ++skipColumns;
      if (skipColumns==LhsPacketSize)
      {
        // nothing can be aligned, no need to skip any column
        alignmentPattern = NoneAligned;
        skipColumns = 0;
      }
      else
      {
        skipColumns = (std::min)(skipColumns,cols);
        // note that the skipped columns are processed later.
      }

      eigen_internal_assert(  (alignmentPattern==NoneAligned)
                            || (skipColumns + columnsAtOnce >= cols)
                            || LhsPacketSize > size
                            || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
    }
    else if(Vectorizable)
    {
      alignedStart = 0;
      alignedSize = size;
      alignmentPattern = AllAligned;
    }

    Index offset1 = (FirstAligned && alignmentStep==1?3:1);
    Index offset3 = (FirstAligned && alignmentStep==1?1:3);
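    // Rationale (inferred): when alignmentStep==1, columns i+1, i+2 and i+3 are
    // misaligned by 3, 2 and 1 elements respectively, while the peeled loop
    // below pairs palign<1>/<2>/<3> with lhs1/lhs2/lhs3; swapping the first and
    // third offsets makes each pointer's misalignment match its shift count.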

    Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
    for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
                ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
                ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
                ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

      // this helps the compiler generate better binary code
      const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                      *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process initial unaligned coeffs
        for (Index j=0; j<alignedStart; ++j)
        {
          res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
          res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
          res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
          res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
        }

        if (alignedSize>alignedStart)
        {
          switch(alignmentPattern)
          {
            case AllAligned:
              for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(d,d,d);
              break;
            case EvenAligned:
              for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(d,du,d);
              break;
            case FirstAligned:
            {
              Index j = alignedStart;
              if(peels>1)
              {
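                /* As in the row-major kernel below: two iterations are peeled
                 * and the unaligned lhs1..lhs3 accesses are emulated with
                 * aligned loads at offsets -1/-2/-3 recombined via palign<1..3>,
                 * which is much cheaper than plain unaligned loads. */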
                LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
                ResPacket T0, T1;

                A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
                A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
                A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

                for (; j<peeledSize; j+=peels*ResPacketSize)
                {
                  A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                  A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                  A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                  A00 = pload<LhsPacket>(&lhs0[j]);
                  A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                  T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                  T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                  T0  = pcj.pmadd(A01, ptmp1, T0);
                  A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                  T0  = pcj.pmadd(A02, ptmp2, T0);
                  A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                  T0  = pcj.pmadd(A03, ptmp3, T0);
                  pstore(&res[j],T0);
                  A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                  T1  = pcj.pmadd(A11, ptmp1, T1);
                  T1  = pcj.pmadd(A12, ptmp2, T1);
                  T1  = pcj.pmadd(A13, ptmp3, T1);
                  pstore(&res[j+ResPacketSize],T1);
                }
              }
              for (; j<alignedSize; j+=ResPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(d,du,du);
              break;
            }
            default:
              for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(du,du,du);
              break;
          }
        }
      } // end explicit vectorization

      /* process remaining coeffs (or all if there is no explicit vectorization) */
      for (Index j=alignedSize; j<size; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }
    }

    // process remaining first and last columns (at most columnsAtOnce-1)
    Index end = cols;
    Index start = columnBound;
    do
    {
      for (Index k=start; k<end; ++k)
      {
        RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
        const LhsScalar* lhs0 = lhs + k*lhsStride;

        if (Vectorizable)
        {
          /* explicit vectorization */
          // process first unaligned result's coeffs
          for (Index j=0; j<alignedStart; ++j)
            res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
          // process aligned result's coeffs
          if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
            for (Index i = alignedStart; i<alignedSize; i+=ResPacketSize)
              pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
          else
            for (Index i = alignedStart; i<alignedSize; i+=ResPacketSize)
              pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        }

        // process remaining scalars (or all if no explicit vectorization)
        for (Index i=alignedSize; i<size; ++i)
          res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
      }
      if (skipColumns)
      {
        start = 0;
        end = skipColumns;
        skipColumns = 0;
      }
      else
        break;
    } while(Vectorizable);
    #undef _EIGEN_ACCUMULATE_PACKETS
  }
};

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces the number of
 * loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
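// Note (illustrative): for a row-major A, y += alpha * A * x is dispatched to
// this kernel, which forms four row-by-vector dot products at a time in packet
// accumulators, reduces them with predux, and scatters the results into res
// with stride resIncr.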
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr,
    ResScalar alpha)
  {
    EIGEN_UNUSED_VARIABLE(rhsIncr);
    eigen_internal_assert(rhsIncr==1);
    #ifdef _EIGEN_ACCUMULATE_PACKETS
    #error _EIGEN_ACCUMULATE_PACKETS has already been defined
    #endif
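    // Same "d"/"du" (pload/ploadu) convention as in the column-major kernel
    // above, but here the four partial dot products are accumulated in
    // registers (ptmp0..ptmp3) instead of being stored back on every step.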

    #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
      RhsPacket b = pload<RhsPacket>(&rhs[j]); \
      ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
      ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
      ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
      ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
    conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

    enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
    const Index rowsAtOnce = 4;
    const Index peels = 2;
    const Index RhsPacketAlignedMask = RhsPacketSize-1;
    const Index LhsPacketAlignedMask = LhsPacketSize-1;
    const Index depth = cols;

    // How many coeffs of the rhs do we have to skip to be aligned.
    // Here we assume data are at least aligned on the base scalar type;
    // if that's not the case then vectorization is discarded, see below.
    Index alignedStart = internal::first_aligned(rhs, depth);
    Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
    const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

    const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
    Index alignmentPattern = alignmentStep==0 ? AllAligned
                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                           : FirstAligned;

    // we cannot assume the first element is aligned because of sub-matrices
    const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

    // find how many rows we have to skip to be aligned with rhs (if possible)
    Index skipRows = 0;
    // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
    if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
    {
      alignedSize = 0;
      alignedStart = 0;
    }
    else if (LhsPacketSize>1)
    {
      eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

      while (skipRows<LhsPacketSize &&
             alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
        ++skipRows;
      if (skipRows==LhsPacketSize)
      {
        // nothing can be aligned, no need to skip any row
        alignmentPattern = NoneAligned;
        skipRows = 0;
      }
      else
      {
        skipRows = (std::min)(skipRows,Index(rows));
        // note that the skipped rows are processed later.
      }
      eigen_internal_assert(  alignmentPattern==NoneAligned
                            || LhsPacketSize==1
                            || (skipRows + rowsAtOnce >= rows)
                            || LhsPacketSize > depth
                            || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
    }
    else if(Vectorizable)
    {
      alignedStart = 0;
      alignedSize = depth;
      alignmentPattern = AllAligned;
    }

    Index offset1 = (FirstAligned && alignmentStep==1?3:1);
    Index offset3 = (FirstAligned && alignmentStep==1?1:3);

    Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
    for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

      // this helps the compiler generate good binary code
      const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                      *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                  ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

        // process initial unaligned coeffs
        // FIXME this loop gets vectorized by the compiler!
        for (Index j=0; j<alignedStart; ++j)
        {
          RhsScalar b = rhs[j];
          tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
          tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
        }

        if (alignedSize>alignedStart)
        {
          switch(alignmentPattern)
          {
            case AllAligned:
              for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(d,d,d);
              break;
            case EvenAligned:
              for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(d,du,d);
              break;
            case FirstAligned:
            {
              Index j = alignedStart;
              if (peels>1)
              {
                /* Here we process 4 rows with two peeled iterations to hide
                 * the overhead of unaligned loads. Moreover, the unaligned
                 * loads are handled using special shift/move operations between
                 * the two aligned packets overlapping the desired unaligned
                 * packet. This is *much* more efficient than basic unaligned
                 * loads.
                 */
                LhsPacket A01, A02, A03, A11, A12, A13;
                A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
                A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
                A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

                for (; j<peeledSize; j+=peels*RhsPacketSize)
                {
                  RhsPacket b = pload<RhsPacket>(&rhs[j]);
                  A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                  A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                  A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                  ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                  ptmp1 = pcj.pmadd(A01, b, ptmp1);
                  A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                  ptmp2 = pcj.pmadd(A02, b, ptmp2);
                  A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                  ptmp3 = pcj.pmadd(A03, b, ptmp3);
                  A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                  b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                  ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                  ptmp1 = pcj.pmadd(A11, b, ptmp1);
                  ptmp2 = pcj.pmadd(A12, b, ptmp2);
                  ptmp3 = pcj.pmadd(A13, b, ptmp3);
                }
              }
              for (; j<alignedSize; j+=RhsPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(d,du,du);
              break;
            }
            default:
              for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
                _EIGEN_ACCUMULATE_PACKETS(du,du,du);
              break;
          }
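          // horizontal reduction: predux sums the lanes of each accumulator
          // packet into a single scalar partial dot product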
          tmp0 += predux(ptmp0);
          tmp1 += predux(ptmp1);
          tmp2 += predux(ptmp2);
          tmp3 += predux(ptmp3);
        }
      } // end explicit vectorization

      // process remaining coeffs (or all if no explicit vectorization)
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }
      res[i*resIncr]           += alpha*tmp0;
      res[(i+offset1)*resIncr] += alpha*tmp1;
      res[(i+2)*resIncr]       += alpha*tmp2;
      res[(i+offset3)*resIncr] += alpha*tmp3;
    }

    // process remaining first and last rows (at most rowsAtOnce-1)
    Index end = rows;
    Index start = rowBound;
    do
    {
      for (Index i=start; i<end; ++i)
      {
        EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
        ResPacket ptmp0 = pset1<ResPacket>(tmp0);
        const LhsScalar* lhs0 = lhs + i*lhsStride;
        // process first unaligned rhs coeffs
        // FIXME this loop gets vectorized by the compiler!
        for (Index j=0; j<alignedStart; ++j)
          tmp0 += cj.pmul(lhs0[j], rhs[j]);

        if (alignedSize>alignedStart)
        {
          // process aligned rhs coeffs
          if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
          else
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
          tmp0 += predux(ptmp0);
        }

        // process remaining scalars
        // FIXME this loop gets vectorized by the compiler!
        for (Index j=alignedSize; j<depth; ++j)
          tmp0 += cj.pmul(lhs0[j], rhs[j]);
        res[i*resIncr] += alpha*tmp0;
      }
      if (skipRows)
      {
        start = 0;
        end = skipRows;
        skipRows = 0;
      }
      else
        break;
    } while(Vectorizable);

    #undef _EIGEN_ACCUMULATE_PACKETS
  }
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H