/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ---- includes ----------------------------------------------------------- */

#include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */

#if ( defined( WIN64 ) || defined( HW_SSE2 ) )

#include "emmintrin.h"

/* disable warning "local variable 'x' used without having been initialized" */
#pragma warning( disable : 4700 )

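/* Note on the approach (added for clarity): all three dot-product variants in
 * this file are built around _mm_madd_epi16 (PMADDWD). Given eight signed
 * 16-bit pairs a0..a7 and b0..b7 it produces four 32-bit partial sums
 *
 *     r0 = a0*b0 + a1*b1,   r1 = a2*b2 + a3*b3,
 *     r2 = a4*b4 + a5*b5,   r3 = a6*b6 + a7*b7,
 *
 * which are accumulated with _mm_add_epi32 and summed horizontally after the
 * main loop. Each loop iteration consumes 16 int16 elements per vector; the
 * remaining ( sizeA % 16 ) elements are handled by the scalar switch tail of
 * each function.
 */
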
/** Uses the lower half (64 bits) of the SSE2 registers to calculate the dot product.
 *  This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 *  Dependencies: input vectors need to be 16-bit aligned (natural int16 alignment;
 *                the 64-bit loads used here do not require 16-byte alignment)
 *  Return Value: int32 containing the result of the dot product
 */
int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 */
	m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
	m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadl_epi64( (__m128i *)vec1L );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

			m_XMM1 = _mm_loadl_epi64( (__m128i *)vec2L );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

			m_XMM2 = _mm_loadl_epi64( (__m128i *)( vec1L + 4 ) );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 );

			m_XMM3 = _mm_loadl_epi64( (__m128i *)( vec2L + 4 ) );
			m_XMM4 = _mm_loadl_epi64( (__m128i *)( vec1L + 8 ) );

			m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 );

			m_XMM5 = _mm_loadl_epi64( (__m128i *)( vec2L + 8 ) );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

			m_XMM6 = _mm_loadl_epi64( (__m128i *)( vec1L + 12 ) );

			m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 );

			m_XMM8 = _mm_loadl_epi64( (__m128i *)( vec2L + 12 ) );
			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

		m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 );

		m_XMM0 = _mm_srli_epi64( m_XMM0, 32 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

		resultL = _mm_cvtsi128_si32( m_XMM7 );
	}

	/* a switch statement produces faster code than a loop here; the cases
	   fall through intentionally to handle the remaining ( sizeA % 16 )
	   elements */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}
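
/* Illustrative sketch (not part of the original file): a plain C reference
 * dot product that can be used to spot-check the SSE2 variants in this file
 * during development. The guard macro BBS_SSE2_SELFTEST and the function name
 * are hypothetical and not defined anywhere else in this code base.
 */
#ifdef BBS_SSE2_SELFTEST
static int32 bbs_dotProduct_refC( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	int32 resultL = 0;
	uint32 iL;
	for( iL = 0; iL < sizeA; iL++ )
	{
		/* widen to 32 bit before multiplying, as the SSE2 code does */
		resultL += ( int32 )vec1A[ iL ] * vec2A[ iL ];
	}
	return resultL;
}
#endif /* BBS_SSE2_SELFTEST */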

/* ------------------------------------------------------------------------- */

/** Uses the full SSE2 registers (128-bit) to calculate the dot product.
 *  Dependencies: input vectors need to be 16-byte aligned
 *  Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_load_si128( (__m128i *)vec1L );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_load_si128( (__m128i *)vec2L );

			m_XMM6 = _mm_load_si128( (__m128i *)( vec1L + 8 ) );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_load_si128( (__m128i *)( vec2L + 8 ) );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );  /* 1st 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */
	}

	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}
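
/* Usage sketch (illustrative only; array names and sizes are made up):
 * bbs_dotProduct_128SSE2 uses _mm_load_si128, so both input vectors must
 * start on a 16-byte boundary. With the MSVC-style alignment attribute this
 * could look like:
 *
 *     __declspec( align( 16 ) ) int16 vecAL[ 128 ];
 *     __declspec( align( 16 ) ) int16 vecBL[ 128 ];
 *     int32 dotL;
 *     ... fill vecAL and vecBL ...
 *     dotL = bbs_dotProduct_128SSE2( vecAL, vecBL, 128 );
 *
 * Buffers that cannot be guaranteed to be 16-byte aligned should use
 * bbs_dotProduct_u128SSE2 below instead.
 */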

/* ------------------------------------------------------------------------- */


/** Uses the full SSE2 registers (128-bit) to calculate the dot product (unaligned version).
 *  Dependencies: memory does not need to be 16-byte aligned
 *  Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;
	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadu_si128( (__m128i *)vec1L );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_loadu_si128( (__m128i *)vec2L );

			m_XMM6 = _mm_loadu_si128( (__m128i *)( vec1L + 8 ) );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_loadu_si128( (__m128i *)( vec2L + 8 ) );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );  /* 1st 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32bits */
	}

	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}
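
/* Dispatch sketch (illustrative, not part of the original API): choose the
 * aligned or unaligned 128-bit variant at run time depending on whether both
 * input pointers lie on a 16-byte boundary. The guard macro
 * BBS_SSE2_DISPATCH_SKETCH and the function name are hypothetical; the cast
 * of a pointer to size_t is assumed to preserve the low address bits on the
 * targeted platforms.
 */
#ifdef BBS_SSE2_DISPATCH_SKETCH

#include <stddef.h> /* size_t */

static int32 bbs_dotProductSSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	/* the _mm_load_si128 path requires both pointers to be 16-byte aligned */
	if( ( ( ( size_t )vec1A | ( size_t )vec2A ) & 0x0F ) == 0 )
	{
		return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA );
	}
	return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA );
}

#endif /* BBS_SSE2_DISPATCH_SKETCH */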

/* ------------------------------------------------------------------------- */

#endif /* defined( WIN64 ) || defined( HW_SSE2 ) */