auto import from //depot/cupcake/@135843
[android/platform/external/neven.git] / Embedded / common / src / b_TensorEm / CompactMat.c
1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 /* ---- includes ----------------------------------------------------------- */
18
19 #include "b_TensorEm/CompactMat.h"
20 #include "b_TensorEm/Functions.h"
21 #include "b_BasicEm/Math.h"
22 #include "b_BasicEm/Functions.h"
23 #include "b_BasicEm/Memory.h"
24
25 /* ------------------------------------------------------------------------- */
26
27 /* ========================================================================= */
28 /*                                                                           */
29 /* ---- \ghd{ auxiliary functions } ---------------------------------------- */
30 /*                                                                           */
31 /* ========================================================================= */
32
33 /* ------------------------------------------------------------------------- */
34
35 /** Returns the dot product of inVecA with the row of the matrix selected by rowA.
36     The result is a floating point expression:
37                 upper 16 bit: signed value
38                 lower 16 bit: signed exponent

       Row layout as read below (16-bit words): offset into inVecA, number of
       values, factor mantissa, factor exponent, row norm bits, followed by the
       values packed with ptrA->bitsPerValueE bits each.

       If inNormBitsA plus the row norm bits reaches 31 or more, the products are
       shifted right (with rounding) by the excess before they are accumulated,
       and the excess is added back to the result exponent.

       cpA         - context; not referenced in this function
       ptrA        - matrix whose cpsArrE holds the packed rows
       inVecA      - input vector; read starting at the offset stored in the row header
       inNormBitsA - norm bit count of the input vector (the caller in this file
                     passes bbs_intLog2( bbs_vecNorm16( ... ) ) + 1)
       rowA        - row index; not range-checked here
39  */
40 int32 bts_CompactMat_fltDotPrdRow( struct bbs_Context* cpA, 
41                                                                    struct bts_CompactMat* ptrA, 
42                                                                const int16* inVecA,
43                                                                uint32 inNormBitsA,
44                                                                uint32 rowA )
45 {
           /* rows are stored back to back with a stride of wordsPerRowE 16-bit words */
46         const int16* rowPtrL = ptrA->cpsArrE.arrPtrE + ptrA->wordsPerRowE * rowA;
47
48         /* extract row-header info (five words; rowPtrL points at the packed values afterwards) */
49         uint32 offsL = *rowPtrL++;          /* index of the first input element used by this row */
50         uint32 sizeL = *rowPtrL++;          /* number of values stored in this row */
51         int32 factorManL = *rowPtrL++;      /* row scale factor: mantissa */
52         int32 factorExpL = *rowPtrL++;      /* row scale factor: exponent */
53         uint32 rowNormBitsL = *rowPtrL++;
54
55         /* consider possible overflow: number of bits by which the combined norm bits exceed 31 */
56         uint16 overflowBitsL = ( inNormBitsA + rowNormBitsL >= 31 ) ? inNormBitsA + rowNormBitsL - 31 : 0;
57
58         const int16* inPtrL = inVecA + offsL;
59
60         count_t iL;
61         int32 sumL = 0;
62
63         if( overflowBitsL == 0 ) /* raw dot product fits in int32 */
64         {
65                 switch( ptrA->bitsPerValueE )
66                 {
67                         case 16: /* one value per word */
68                         {
69                                 for( iL = sizeL; iL > 0; iL-- ) sumL += ( ( int32 )*rowPtrL++ * ( int32 )*inPtrL++ );
70                         }
71                         break;
72
73                         #ifndef HW_TMS320C5x /* platforms that don't have int8 must use the 'default' implementation */
74
75                         case 8: /* two signed 8-bit values per word, low byte first */
76                         {
77                                 const uint16* dpL = ( uint16* )rowPtrL;
78                                 for( iL = sizeL; iL >= 8; iL -= 8 ) /* unrolled: 8 values = 4 words per pass */
79                                 {
80                                         sumL += ( ( int8 )  dpL[ 0 ]         * ( int32 )inPtrL[ 0 ] );
81                                         sumL += ( ( int8 )( dpL[ 0 ] >>  8 ) * ( int32 )inPtrL[ 1 ] );
82                                         sumL += ( ( int8 )  dpL[ 1 ]         * ( int32 )inPtrL[ 2 ] );
83                                         sumL += ( ( int8 )( dpL[ 1 ] >>  8 ) * ( int32 )inPtrL[ 3 ] );
84                                         sumL += ( ( int8 )  dpL[ 2 ]         * ( int32 )inPtrL[ 4 ] );
85                                         sumL += ( ( int8 )( dpL[ 2 ] >>  8 ) * ( int32 )inPtrL[ 5 ] );
86                                         sumL += ( ( int8 )  dpL[ 3 ]         * ( int32 )inPtrL[ 6 ] );
87                                         sumL += ( ( int8 )( dpL[ 3 ] >>  8 ) * ( int32 )inPtrL[ 7 ] );
88                                         dpL += 4;
89                                         inPtrL += 8;
90                                 }
91                                 for( ; iL >= 2; iL -= 2 ) /* remaining complete words */
92                                 {
93                                         sumL += ( ( int8 )  *dpL         * ( int32 )inPtrL[ 0 ] );
94                                         sumL += ( ( int8 )( *dpL >>  8 ) * ( int32 )inPtrL[ 1 ] );
95                                         dpL++;
96                                         inPtrL += 2;
97                                 }
98                                 if( iL > 0 ) /* odd count: only the low byte of the last word is used */
99                                 {
100                                         sumL += ( ( int8 )*dpL++ * ( int32 )inPtrL[ 0 ] );
101                                 }
102                         }
103                         break;
104
105                         case 6: /* 8 values occupy 3 words */
106                         {
107                                 const uint16* dpL = ( uint16* )rowPtrL;
108                                 for( iL = sizeL; iL >= 8; iL -= 8 )
109                                 {
                                            /* Each 6-bit value is moved into the upper 6 bits of a byte so that the
                                             * int8 cast sign-extends it; the products are therefore scaled by 4 and
                                             * the scaling is removed by the final '>> 2'. */
110                                         int32 lSumL = 0;
111                                         lSumL += ( ( int8 )     ( dpL[ 0 ] <<  2 )                                  * ( int32 )inPtrL[ 0 ] );
112                                         lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  4 )                       & 0x00FC ) * ( int32 )inPtrL[ 1 ] );
113                                         lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 10 ) | ( dpL[ 1 ] << 6 ) ) & 0x00FC ) * ( int32 )inPtrL[ 2 ] );
114                                         lSumL += ( ( int8 ) (   ( dpL[ 1 ]       )                       & 0x00FC ) * ( int32 )inPtrL[ 3 ] );
115                                         lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00FC ) * ( int32 )inPtrL[ 4 ] );
116                                         lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 12 ) | ( dpL[ 2 ] << 4 ) ) & 0x00FC ) * ( int32 )inPtrL[ 5 ] );
117                                         lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  2 )                       & 0x00FC ) * ( int32 )inPtrL[ 6 ] );
118                                         lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  8 )                       & 0x00FC ) * ( int32 )inPtrL[ 7 ] );
119                                         sumL += ( lSumL >> 2 );
120                                         dpL += 3;
121                                         inPtrL += 8;
122                                 }
123
124                                 { /* tail: up to 7 remaining values, same extraction as above */
125                                         int32 lSumL = 0;
126                                         if( iL > 0 ) lSumL += ( ( int8 )     ( dpL[ 0 ] <<  2 )                                  * ( int32 )inPtrL[ 0 ] );
127                                         if( iL > 1 ) lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  4 )                       & 0x00FC ) * ( int32 )inPtrL[ 1 ] );
128                                         if( iL > 2 ) lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 10 ) | ( dpL[ 1 ] << 6 ) ) & 0x00FC ) * ( int32 )inPtrL[ 2 ] );
129                                         if( iL > 3 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ]       )                       & 0x00FC ) * ( int32 )inPtrL[ 3 ] );
130                                         if( iL > 4 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00FC ) * ( int32 )inPtrL[ 4 ] );
131                                         if( iL > 5 ) lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 12 ) | ( dpL[ 2 ] << 4 ) ) & 0x00FC ) * ( int32 )inPtrL[ 5 ] );
132                                         if( iL > 6 ) lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  2 )                       & 0x00FC ) * ( int32 )inPtrL[ 6 ] );
133                                         sumL += ( lSumL >> 2 );
134                                 }
135                         }
136                         break;
137
138                         case 5: /* 16 values occupy 5 words */
139                         {
140                                 const uint16* dpL = ( uint16* )rowPtrL;
141                                 for( iL = sizeL; iL >= 16; iL -= 16 )
142                                 {
                                            /* Each 5-bit value is moved into the upper 5 bits of a byte (mask 0xF8)
                                             * so that the int8 cast sign-extends it; the resulting scaling by 8 is
                                             * removed by the final '>> 3'. */
143                                         int32 lSumL = 0;
144                                         lSumL += ( ( int8 )     ( dpL[ 0 ] <<  3 )                                  * ( int32 )inPtrL[  0 ] );
145                                         lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  2 )                       & 0x00F8 ) * ( int32 )inPtrL[  1 ] );
146                                         lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  7 )                       & 0x00F8 ) * ( int32 )inPtrL[  2 ] );
147                                         lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 12 ) | ( dpL[ 1 ] << 4 ) ) & 0x00F8 ) * ( int32 )inPtrL[  3 ] );
148                                         lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  1 )                       & 0x00F8 ) * ( int32 )inPtrL[  4 ] );
149                                         lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00F8 ) * ( int32 )inPtrL[  5 ] );
150                                         lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 11 ) | ( dpL[ 2 ] << 5 ) ) & 0x00F8 ) * ( int32 )inPtrL[  6 ] );
151                                         lSumL += ( ( int8 ) (   ( dpL[ 2 ]       )                       & 0x00F8 ) * ( int32 )inPtrL[  7 ] );
152                                         lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  5 )                       & 0x00F8 ) * ( int32 )inPtrL[  8 ] );
153                                         lSumL += ( ( int8 ) ( ( ( dpL[ 2 ] >> 10 ) | ( dpL[ 3 ] << 6 ) ) & 0x00F8 ) * ( int32 )inPtrL[  9 ] );
154                                         lSumL += ( ( int8 ) (   ( dpL[ 3 ] <<  1 )                       & 0x00F8 ) * ( int32 )inPtrL[ 10 ] );
155                                         lSumL += ( ( int8 ) (   ( dpL[ 3 ] >>  4 )                       & 0x00F8 ) * ( int32 )inPtrL[ 11 ] );
156                                         lSumL += ( ( int8 ) ( ( ( dpL[ 3 ] >>  9 ) | ( dpL[ 4 ] << 7 ) ) & 0x00F8 ) * ( int32 )inPtrL[ 12 ] );
157                                         lSumL += ( ( int8 ) (   ( dpL[ 4 ] <<  2 )                       & 0x00F8 ) * ( int32 )inPtrL[ 13 ] );
158                                         lSumL += ( ( int8 ) (   ( dpL[ 4 ] >>  3 )                       & 0x00F8 ) * ( int32 )inPtrL[ 14 ] );
159                                         lSumL += ( ( int8 ) (   ( dpL[ 4 ] >>  8 )                       & 0x00F8 ) * ( int32 )inPtrL[ 15 ] );
160                                         sumL += ( lSumL >> 3 );
161                                         dpL += 5;
162                                         inPtrL += 16;
163                                 }
164
165                                 { /* tail: up to 15 remaining values, same extraction as above */
166                                         int32 lSumL = 0;
167                                         if( iL >  0 ) lSumL += ( ( int8 )     ( dpL[ 0 ] <<  3 )                                  * ( int32 )inPtrL[  0 ] );
168                                         if( iL >  1 ) lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  2 )                       & 0x00F8 ) * ( int32 )inPtrL[  1 ] );
169                                         if( iL >  2 ) lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  7 )                       & 0x00F8 ) * ( int32 )inPtrL[  2 ] );
170                                         if( iL >  3 ) lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 12 ) | ( dpL[ 1 ] << 4 ) ) & 0x00F8 ) * ( int32 )inPtrL[  3 ] );
171                                         if( iL >  4 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  1 )                       & 0x00F8 ) * ( int32 )inPtrL[  4 ] );
172                                         if( iL >  5 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00F8 ) * ( int32 )inPtrL[  5 ] );
173                                         if( iL >  6 ) lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 11 ) | ( dpL[ 2 ] << 5 ) ) & 0x00F8 ) * ( int32 )inPtrL[  6 ] );
174                                         if( iL >  7 ) lSumL += ( ( int8 ) (   ( dpL[ 2 ]       )                       & 0x00F8 ) * ( int32 )inPtrL[  7 ] );
175                                         if( iL >  8 ) lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  5 )                       & 0x00F8 ) * ( int32 )inPtrL[  8 ] );
176                                         if( iL >  9 ) lSumL += ( ( int8 ) ( ( ( dpL[ 2 ] >> 10 ) | ( dpL[ 3 ] << 6 ) ) & 0x00F8 ) * ( int32 )inPtrL[  9 ] );
177                                         if( iL > 10 ) lSumL += ( ( int8 ) (   ( dpL[ 3 ] <<  1 )                       & 0x00F8 ) * ( int32 )inPtrL[ 10 ] );
178                                         if( iL > 11 ) lSumL += ( ( int8 ) (   ( dpL[ 3 ] >>  4 )                       & 0x00F8 ) * ( int32 )inPtrL[ 11 ] );
179                                         if( iL > 12 ) lSumL += ( ( int8 ) ( ( ( dpL[ 3 ] >>  9 ) | ( dpL[ 4 ] << 7 ) ) & 0x00F8 ) * ( int32 )inPtrL[ 12 ] );
180                                         if( iL > 13 ) lSumL += ( ( int8 ) (   ( dpL[ 4 ] <<  2 )                       & 0x00F8 ) * ( int32 )inPtrL[ 13 ] );
181                                         if( iL > 14 ) lSumL += ( ( int8 ) (   ( dpL[ 4 ] >>  3 )                       & 0x00F8 ) * ( int32 )inPtrL[ 14 ] );
182                                         sumL += ( lSumL >> 3 );
183                                 }
184                         }
185                         break;
186
187                         case 4: /* four values per word; each nibble is moved to the high nibble of a byte, scaling removed by '>> 4' */
188                         {
189                                 for( iL = sizeL; iL >= 4; iL -= 4 )
190                                 {
191                                         uint16 v1L = *rowPtrL++;
192                                         int32 lSumL = 0;
193                                         lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
194                                         lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
195                                         lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
196                                         lSumL += ( ( int8 )( ( v1L >> 8 ) & 0xF0 ) * ( int32 )inPtrL[ 3 ] );
197                                         inPtrL += 4;
198                                         sumL += ( lSumL >> 4 );
199                                 }
200                                 { /* tail: up to 3 values; note that one more word is read even when iL is 0 */
201                                         uint16 v1L = *rowPtrL++;
202                                         int32 lSumL = 0;
203                                         if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
204                                         if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
205                                         if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
206                                         sumL += ( lSumL >> 4 );
207                                 }
208                         }
209                         break;
210
211                         #endif /*ifndef HW_TMS320C5x*/
212
213                         /* The default case can process all bit sizes including those that are explicitly encoded above
214                          * Use the default for all bit sizes when the platform cannot handle the int8 data type (e.g. HW_TMS320C5x)
215                          */
216                         default:
217                         {
                                    /* bfL is a 32-bit window on the packed stream: the most recently read word
                                     * sits in the upper half, the word before it in the lower half.
                                     * mkL selects the upper bitsL bits of a 16-bit quantity, so the int16 cast
                                     * sign-extends the value (scaled by 2^adjL); '>> adjL' removes that scaling.
                                     * srL is the shift that brings the next value under mkL. */
218                                 uint32 bfL = ( ( uint32 )*rowPtrL++ ) << 16;
219                                 uint32 bitsL = ptrA->bitsPerValueE;
220                                 uint16 adjL = 16 - bitsL;
221                                 uint32 mkL = ( ( 1 << bitsL ) - 1 ) << adjL;
222                                 uint32 srL = bitsL;
223                                 for( iL = 0; iL < sizeL; iL++ )
224                                 {
225                                         if( srL > 16 ) /* window exhausted: shift in the next word */
226                                         {
227                                                 bfL = ( ( ( uint32 )*rowPtrL++ ) << 16 ) | ( bfL >> 16 );
228                                                 srL -= 16;
229                                         }
230                                         sumL += ( ( int16 )( ( bfL >> srL ) & mkL ) * ( int32 )inPtrL[ iL ] ) >> adjL;
231                                         srL += bitsL;
232                                 }
233                         }
234                 }
235         }
236         else /* raw dot product does not fit in int32 */
237         {
                    /* roundL is half of 2^overflowBitsL: added before the right shift to round instead of truncate */
238                 int32 roundL = 1 << ( overflowBitsL - 1 );
239                 switch( ptrA->bitsPerValueE )
240                 {
241                         case 16: /* every product is reduced individually */
242                         {
243                                 for( iL = sizeL; iL > 0; iL-- ) sumL += ( ( ( int32 )*rowPtrL++ * ( int32 )*inPtrL++ ) + roundL ) >> overflowBitsL;
244                         }
245                         break;
246
247                         case 8: /* products are reduced per word (pairs of values) */
248                         {
249                                 for( iL = sizeL; iL >= 2; iL -= 2 )
250                                 {
251                                         uint16 v1L = *rowPtrL++;
252                                         int32 lSumL =   ( ( int8 )  v1L         * ( int32 )inPtrL[ 0 ] )
253                                                           + ( ( int8 )( v1L >>  8 ) * ( int32 )inPtrL[ 1 ] );
254                                         sumL += ( lSumL + roundL ) >> overflowBitsL;
255                                         inPtrL += 2;
256                                 }
257                                 if( iL > 0 ) /* odd count: low byte of the last word */
258                                 {
259                                         sumL += ( ( ( int8 )*rowPtrL++ * ( int32 )inPtrL[ 0 ] ) + roundL ) >> overflowBitsL;
260                                 }
261                         }
262                         break;
263
264                         case 4: /* products are reduced per word (groups of up to four values) */
265                         {
266                                 for( iL = sizeL; iL >= 4; iL -= 4 )
267                                 {
268                                         uint16 v1L = *rowPtrL++;
269                                         int32 lSumL = 0;
270                                         lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
271                                         lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
272                                         lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
273                                         lSumL += ( ( int8 )( ( v1L >> 8 ) & 0xF0 ) * ( int32 )inPtrL[ 3 ] );
274                                         inPtrL += 4;
275                                         sumL += ( ( lSumL >> 4 ) + roundL ) >> overflowBitsL;
276                                 }
277                                 { /* tail: up to 3 values; one more word is read even when iL is 0 */
278                                         uint16 v1L = *rowPtrL++;
279                                         int32 lSumL = 0;
280                                         if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
281                                         if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
282                                         if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
283                                         sumL += ( ( lSumL >> 4 ) + roundL ) >> overflowBitsL;
284                                 }
285                         }
286                         break;
287
288                         default: /* generic bit-field reader, as in the non-overflow branch */
289                         {
290                                 uint32 bfL = ( ( uint32 )*rowPtrL++ ) << 16;
291                                 uint32 bitsL = ptrA->bitsPerValueE;
292                                 uint16 adjL = 16 - bitsL;
293                                 uint32 mkL = ( ( 1 << bitsL ) - 1 ) << adjL;
294                                 uint32 srL = bitsL;
                                    /* the value scaling (adjL) and the overflow reduction are folded into one
                                     * rounding constant and one shift */
295                                 int32 lRoundL = roundL << adjL;
296                                 int32 lAdjL = overflowBitsL + adjL;
297                                 for( iL = 0; iL < sizeL; iL++ )
298                                 {
299                                         if( srL > 16 )
300                                         {
301                                                 bfL = ( ( ( uint32 )*rowPtrL++ ) << 16 ) | ( bfL >> 16 );
302                                                 srL -= 16;
303                                         }
304                                         sumL += ( ( int16 )( ( bfL >> srL ) & mkL ) * ( int32 )inPtrL[ iL ] + lRoundL ) >> lAdjL;
305                                         srL += bitsL;
306                                 }
307                         }
308                 }
309         }
310
311         /* compute result: scale the sum by the row factor, normalize, and pack mantissa/exponent */
312         {
313                 int32 resultManL;
314                 int32 resultExpL;
315                 int32 resultLogL;
                    /* bbs_mulS32 delivers the product through the two output pointers
                     * (presumably as mantissa and exponent - TODO confirm against b_BasicEm) */
316                 bbs_mulS32( sumL, factorManL, &resultManL, &resultExpL );
317                 resultExpL += factorExpL + overflowBitsL; /* re-apply the bits shifted out above */
318                 resultLogL = bbs_intLog2( resultManL > 0 ? resultManL : -resultManL );
319                 if( resultLogL < 30 ) /* shift the mantissa up so that the log2 of its magnitude becomes 30 */
320                 {
321                         resultManL <<= 30 - resultLogL;
322                         resultExpL  -= 30 - resultLogL;
323                 }
324
                    /* drop the lower 16 bits with rounding and account for them in the exponent */
325                 resultManL = ( ( resultManL >> 15 ) + 1 ) >> 1;
326                 resultExpL = resultExpL + 16;
327
                    /* pack: mantissa in the upper half, exponent in the lower half */
328                 return ( ( resultManL & 0x0000FFFF ) << 16 ) | ( resultExpL & 0x0000FFFF );
329         }
330 }
331
332 /* ------------------------------------------------------------------------- */
333
334 /* ========================================================================= */
335 /*                                                                           */
336 /* ---- \ghd{ constructor / destructor } ----------------------------------- */
337 /*                                                                           */
338 /* ========================================================================= */
339
340 /* ------------------------------------------------------------------------- */
341
342 void bts_CompactMat_init( struct bbs_Context* cpA,
343                                               struct bts_CompactMat* ptrA )
344 {
345         ptrA->widthE = 0;
346         ptrA->heightE = 0;
347         ptrA->bitsPerValueE = 0;
348         ptrA->wordsPerRowE = 0;
349         ptrA->maxRowBitsE = 0;
350         bbs_Int16Arr_init( cpA, &ptrA->cpsArrE );
351         bbs_Int16Arr_init( cpA, &ptrA->expArrE );
352         
353 }
354
355 /* ------------------------------------------------------------------------- */
356
357 void bts_CompactMat_exit( struct bbs_Context* cpA,
358                                             struct bts_CompactMat* ptrA )
359 {
360         ptrA->widthE = 0;
361         ptrA->heightE = 0;
362         ptrA->bitsPerValueE = 0;
363         ptrA->wordsPerRowE = 0;
364         ptrA->maxRowBitsE = 0;
365         bbs_Int16Arr_exit( cpA, &ptrA->cpsArrE );
366         bbs_Int16Arr_exit( cpA, &ptrA->expArrE );
367 }
368 /* ------------------------------------------------------------------------- */
369
370 /* ========================================================================= */
371 /*                                                                           */
372 /* ---- \ghd{ operators } -------------------------------------------------- */
373 /*                                                                           */
374 /* ========================================================================= */
375
376 /* ------------------------------------------------------------------------- */
377
378 /* ========================================================================= */
379 /*                                                                           */
380 /* ---- \ghd{ query functions } -------------------------------------------- */
381 /*                                                                           */
382 /* ========================================================================= */
383
384 /* ------------------------------------------------------------------------- */
385
386 /* ========================================================================= */
387 /*                                                                           */
388 /* ---- \ghd{ modify functions } ------------------------------------------- */
389 /*                                                                           */
390 /* ========================================================================= */
391
392 /* ------------------------------------------------------------------------- */
393         
394 void bts_CompactMat_create( struct bbs_Context* cpA,
395                                                     struct bts_CompactMat* ptrA, 
396                                                     uint32 widthA,
397                                                     uint32 heightA,
398                                                     uint32 bitsA,
399                                                         uint32 maxRowSizeA,
400                                             struct bbs_MemSeg* mspA )
401 {
402         if( bbs_Context_error( cpA ) ) return;
403         if( bitsA < 2 || bitsA > 16 )
404         {
405                 bbs_ERROR0( "bts_CompactMat_create:\nbitsA must be between 2 and 16" );
406                 return;
407         }
408
409         ptrA->widthE = widthA;
410         ptrA->heightE = heightA;
411         ptrA->bitsPerValueE = bitsA;
412         ptrA->wordsPerRowE = 6 /*header + 1*/ + ( ( maxRowSizeA * bitsA ) / ( 8 * sizeof( short ) ) );
413         ptrA->maxRowBitsE = 0;
414         if( ( ptrA->wordsPerRowE & 1 ) != 0 ) ptrA->wordsPerRowE++;
415         bbs_Int16Arr_create( cpA, &ptrA->cpsArrE, heightA * ptrA->wordsPerRowE, mspA );
416         bbs_Int16Arr_fill( cpA, &ptrA->cpsArrE, 0 );
417         bbs_Int16Arr_create( cpA, &ptrA->expArrE, ptrA->heightE, mspA );
418         bbs_Int16Arr_fill( cpA, &ptrA->expArrE, 0 );
419 }
420
421 /* ------------------------------------------------------------------------- */
422         
423 void bts_CompactMat_copy( struct bbs_Context* cpA,
424                                               struct bts_CompactMat* ptrA, 
425                                                   const struct bts_CompactMat* srcPtrA )
426 {
427         ptrA->widthE = srcPtrA->widthE;
428         ptrA->heightE = srcPtrA->heightE;
429         ptrA->bitsPerValueE = srcPtrA->bitsPerValueE;
430         ptrA->wordsPerRowE = srcPtrA->wordsPerRowE;
431         ptrA->maxRowBitsE = srcPtrA->maxRowBitsE;
432         bbs_Int16Arr_copy( cpA, &ptrA->cpsArrE, &srcPtrA->cpsArrE );
433         bbs_Int16Arr_size( cpA, &ptrA->expArrE, ptrA->heightE );
434 }
435
436 /* ------------------------------------------------------------------------- */
437         
438 /* ========================================================================= */
439 /*                                                                           */
440 /* ---- \ghd{ I/O } -------------------------------------------------------- */
441 /*                                                                           */
442 /* ========================================================================= */
443
444 /* ------------------------------------------------------------------------- */
445         
446 uint32 bts_CompactMat_memSize( struct bbs_Context* cpA,
447                                                          const struct bts_CompactMat *ptrA )
448 {
449         return  bbs_SIZEOF16( uint32 )
450                   + bbs_SIZEOF16( uint32 ) /* version */
451                   + bbs_SIZEOF16( ptrA->widthE ) 
452                   + bbs_SIZEOF16( ptrA->heightE ) 
453                   + bbs_SIZEOF16( ptrA->bitsPerValueE ) 
454                   + bbs_SIZEOF16( ptrA->wordsPerRowE )
455                   + bbs_SIZEOF16( ptrA->maxRowBitsE )
456                   + bbs_Int16Arr_memSize( cpA, &ptrA->cpsArrE );
457 }
458
459 /* ------------------------------------------------------------------------- */
460         
461 uint32 bts_CompactMat_memWrite( struct bbs_Context* cpA,
462                                                           const struct bts_CompactMat* ptrA, 
463                                                           uint16* memPtrA )
464 {
465         uint32 memSizeL = bts_CompactMat_memSize( cpA, ptrA );
466         memPtrA += bbs_memWrite32( &memSizeL, memPtrA );
467         memPtrA += bbs_memWriteUInt32( bts_COMPACT_MAT_VERSION, memPtrA );
468         memPtrA += bbs_memWrite32( &ptrA->widthE, memPtrA );
469         memPtrA += bbs_memWrite32( &ptrA->heightE, memPtrA );
470         memPtrA += bbs_memWrite32( &ptrA->bitsPerValueE, memPtrA );
471         memPtrA += bbs_memWrite32( &ptrA->wordsPerRowE, memPtrA );
472         memPtrA += bbs_memWrite32( &ptrA->maxRowBitsE, memPtrA );
473         memPtrA += bbs_Int16Arr_memWrite( cpA, &ptrA->cpsArrE, memPtrA );
474         return memSizeL;
475 }
476
477 /* ------------------------------------------------------------------------- */
478         
479 uint32 bts_CompactMat_memRead( struct bbs_Context* cpA,
480                                                          struct bts_CompactMat* ptrA, 
481                                                          const uint16* memPtrA,
482                                              struct bbs_MemSeg* mspA )
483 {
484         uint32 memSizeL, versionL;
485         if( bbs_Context_error( cpA ) ) return 0;
486         memPtrA += bbs_memRead32( &memSizeL, memPtrA );
487         memPtrA += bbs_memReadVersion32( cpA, &versionL, bts_COMPACT_MAT_VERSION, memPtrA );
488         memPtrA += bbs_memRead32( &ptrA->widthE, memPtrA );
489         memPtrA += bbs_memRead32( &ptrA->heightE, memPtrA );
490         memPtrA += bbs_memRead32( &ptrA->bitsPerValueE, memPtrA );
491         memPtrA += bbs_memRead32( &ptrA->wordsPerRowE, memPtrA );
492         memPtrA += bbs_memRead32( &ptrA->maxRowBitsE, memPtrA );
493         memPtrA += bbs_Int16Arr_memRead( cpA, &ptrA->cpsArrE, memPtrA, mspA );
494
495         if( memSizeL != bts_CompactMat_memSize( cpA, ptrA ) )
496         {
497                 bbs_ERR0( bbs_ERR_CORRUPT_DATA, "uint32 bts_CompactMat_memRead( const struct bts_CompactMat* ptrA, const void* memPtrA ):\n"
498                   "size mismatch" ); 
499         }
500
501         bbs_Int16Arr_create( cpA, &ptrA->expArrE, ptrA->heightE, mspA );
502         bbs_Int16Arr_fill( cpA, &ptrA->expArrE, 0 );
503
504         return memSizeL;
505 }
506
507 /* ------------------------------------------------------------------------- */
508         
509 /* ========================================================================= */
510 /*                                                                           */
511 /* ---- \ghd{ exec functions } --------------------------------------------- */
512 /*                                                                           */
513 /* ========================================================================= */
514
515 /* ------------------------------------------------------------------------- */
516
517 void bts_CompactMat_map( struct bbs_Context* cpA, 
518                                                  const struct bts_CompactMat* ptrA, 
519                                                  const int16* inVecA,
520                                                  int16* outVecA,
521                                                  int16* outExpPtrA )
522 {
523         uint32 inNormBitsL = bbs_intLog2( bbs_vecNorm16( inVecA, ptrA->widthE ) ) + 1;
524         uint32 iL;
525
526         int16* expArrL = ( ( struct bts_CompactMat* )ptrA )->expArrE.arrPtrE;
527         int16 maxExpL = -32767;
528
529         for( iL = 0; iL < ptrA->heightE; iL++ )
530         {
531                 int32 fltL = bts_CompactMat_fltDotPrdRow( cpA, ( struct bts_CompactMat* )ptrA, inVecA, inNormBitsL, iL );
532                 outVecA[ iL ] = fltL >> 16; 
533                 expArrL[ iL ] = fltL & 0x0000FFFF;
534
535                 maxExpL = ( expArrL[ iL ] > maxExpL ) ? expArrL[ iL ] : maxExpL;
536         }
537
538         if( outExpPtrA != NULL ) *outExpPtrA = maxExpL;
539
540         for( iL = 0; iL < ptrA->heightE; iL++ )
541         {
542                 int32 shrL = maxExpL - expArrL[ iL ];
543                 if( shrL > 0 )
544                 {
545                         outVecA[ iL ] = ( ( outVecA[ iL ] >> ( shrL - 1 ) ) + 1 ) >> 1;
546                 }
547         }
548 }
549
550 /* ------------------------------------------------------------------------- */
551
552 /* ========================================================================= */
553