tiny_dnn 1.0.0
A header only, dependency-free deep learning framework in C++11
Loading...
Searching...
No Matches
product.h
1/*
2 Copyright (c) 2013, Taiga Nomi
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in the
11 documentation and/or other materials provided with the distribution.
12 * Neither the name of the <organization> nor the
13 names of its contributors may be used to endorse or promote products
14 derived from this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
17 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
20 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27#pragma once
28#if defined(CNN_USE_SSE) || defined(CNN_USE_AVX)
29#include <immintrin.h>
30#endif
31#include <cstdint>
32#include <cassert>
33#include <numeric>
34
// VECTORIZE_ALIGN(x): prefix for a variable declaration that requests x-byte
// alignment, e.g. `VECTORIZE_ALIGN(16) float tmp[4];`.
#if defined(_MSC_VER)
#define VECTORIZE_ALIGN(x) __declspec(align(x))
#elif defined(__GNUC__)
#define VECTORIZE_ALIGN(x) __attribute__((aligned(x)))
#else
// Unknown compiler: use the standard C++11 specifier rather than assuming
// that the GCC-specific attribute syntax is understood.
#define VECTORIZE_ALIGN(x) alignas(x)
#endif
42
43namespace vectorize {
44namespace detail {
45
46
// Fallback alignment predicate: for a traits type without a more specific
// overload, every pointer is reported as aligned.
template<typename Traits>
inline bool is_aligned(Traits /*tag*/, const typename Traits::value_type* /*ptr*/) {
    const bool no_alignment_required = true;
    return no_alignment_required;
}
51
// Two-pointer convenience overload: true only when both p1 and p2 pass the
// single-pointer is_aligned check for traits type T. p2 is not examined when
// p1 already fails.
template<typename T>
inline bool is_aligned(T, const typename T::value_type* p1, const typename T::value_type* p2) {
    if (!is_aligned(T(), p1)) {
        return false;
    }
    return is_aligned(T(), p2);
}
56
57// traits
58
/**
 * Scalar fallback traits used when no SIMD instruction set is enabled.
 *
 * The "register" is a single value of type T, so every operation acts on one
 * element at a time (unroll_size == 1). It exposes the same static interface
 * as the SIMD traits so the generic kernels can be instantiated with it.
 */
template <typename T>
struct generic_vec_type {
    typedef T register_type;
    typedef T value_type;
    enum {
        unroll_size = 1  // number of elements processed per register
    };
    // Broadcast a scalar into a register (identity for the scalar case).
    static register_type set1(const value_type& x) { return x; }
    // Register holding zero.
    static register_type zero() { return register_type(0); }
    // Element-wise product / sum.
    static register_type mul(const register_type& v1, const register_type& v2) { return v1 * v2; }
    static register_type add(const register_type& v1, const register_type& v2) { return v1 + v2; }
    // Aligned and unaligned loads are the same plain read here.
    static register_type load(const value_type* px) { return *px; }
    static register_type loadu(const value_type* px) { return *px; }
    // Aligned and unaligned stores are the same plain write here.
    static void store(value_type* px, const register_type& v) { *px = v; }
    static void storeu(value_type* px, const register_type& v) { *px = v; }
    // Collapse a register into one scalar (identity for the scalar case).
    static value_type resemble(const register_type& x) { return x; }
};
76
77#ifdef CNN_USE_SSE
78
79struct float_sse {
80 typedef __m128 register_type;
81 typedef float value_type;
82 enum {
83 unroll_size = 4
84 };
85 static register_type set1(const value_type& x) { return _mm_set1_ps(x); }
86 static register_type zero() { register_type v = {}; return v; }
87 static register_type mul(const register_type& v1, const register_type& v2) { return _mm_mul_ps(v1, v2); }
88 static register_type add(const register_type& v1, const register_type& v2) { return _mm_add_ps(v1, v2); }
89 static register_type load(const value_type* px) { return _mm_load_ps(px); }
90 static register_type loadu(const value_type* px) { return _mm_loadu_ps(px); }
91 static void store(value_type* px, const register_type& v) { _mm_store_ps(px, v); }
92 static void storeu(value_type* px, const register_type& v) { _mm_storeu_ps(px, v); }
93 static value_type resemble(const register_type& x) {
94 VECTORIZE_ALIGN(16) float tmp[4];
95 _mm_store_ps(tmp, x);
96 return tmp[0] + tmp[1] + tmp[2] + tmp[3];
97 }
98};
99
100struct double_sse {
101 typedef __m128d register_type;
102 typedef double value_type;
103 enum {
104 unroll_size = 2
105 };
106 static register_type set1(const value_type& x) { return _mm_set1_pd(x); }
107 static register_type zero() { register_type v = {}; return v; }
108 static register_type mul(const register_type& v1, const register_type& v2) { return _mm_mul_pd(v1, v2); }
109 static register_type add(const register_type& v1, const register_type& v2) { return _mm_add_pd(v1, v2); }
110 static register_type load(const value_type* px) { return _mm_load_pd(px); }
111 static register_type loadu(const value_type* px) { return _mm_loadu_pd(px); }
112 static void store(value_type* px, const register_type& v) { _mm_store_pd(px, v); }
113 static void storeu(value_type* px, const register_type& v) { _mm_storeu_pd(px, v); }
114 static value_type resemble(const register_type& x) {
115 VECTORIZE_ALIGN(16) double tmp[2];
116 _mm_store_pd(tmp, x);
117 return tmp[0] + tmp[1];
118 }
119};
120
121template<typename T>
122struct sse {};
123template<>
124struct sse<float> : public float_sse {};
125template<>
126struct sse<double> : public double_sse {};
127
128template<typename T>
129inline bool is_aligned(sse<T>, const typename sse<T>::value_type* p) {
130 return reinterpret_cast<std::size_t>(p) % 16 == 0;
131}
132
133#endif // CNN_USE_SSE
134
135#ifdef CNN_USE_AVX
136
137struct float_avx {
138 typedef __m256 register_type;
139 typedef float value_type;
140 enum {
141 unroll_size = 8
142 };
143 static register_type set1(const value_type& x) { return _mm256_set1_ps(x); }
144 static register_type zero() { register_type v = {}; return v; }
145 static register_type mul(const register_type& v1, const register_type& v2) { return _mm256_mul_ps(v1, v2); }
146 static register_type add(const register_type& v1, const register_type& v2) { return _mm256_add_ps(v1, v2); }
147 static register_type load(const value_type* px) { return _mm256_load_ps(px); }
148 static register_type loadu(const value_type* px) { return _mm256_loadu_ps(px); }
149 static void store(value_type* px, const register_type& v) { _mm256_store_ps(px, v); }
150 static void storeu(value_type* px, const register_type& v) { _mm256_storeu_ps(px, v); }
151 static value_type resemble(const register_type& x) {
152 VECTORIZE_ALIGN(32) float tmp[8];
153 _mm256_store_ps(tmp, x);
154 return std::accumulate(tmp, tmp + 8, 0.0f);
155 }
156};
157
158struct double_avx {
159 typedef __m256d register_type;
160 typedef double value_type;
161 enum {
162 unroll_size = 4
163 };
164 static register_type set1(const value_type& x) { return _mm256_set1_pd(x); }
165 static register_type zero() { register_type v = {}; return v; }
166 static register_type mul(const register_type& v1, const register_type& v2) { return _mm256_mul_pd(v1, v2); }
167 static register_type add(const register_type& v1, const register_type& v2) { return _mm256_add_pd(v1, v2); }
168 static register_type load(const value_type* px) { return _mm256_load_pd(px); }
169 static register_type loadu(const value_type* px) { return _mm256_loadu_pd(px); }
170 static void store(value_type* px, const register_type& v) { _mm256_store_pd(px, v); }
171 static void storeu(value_type* px, const register_type& v) { _mm256_storeu_pd(px, v); }
172 static value_type resemble(const register_type& x) {
173 VECTORIZE_ALIGN(32) double tmp[4];
174 _mm256_store_pd(tmp, x);
175 return std::accumulate(tmp, tmp + 4, 0.0);
176 }
177};
178
179template<typename T>
180struct avx {};
181template<>
182struct avx<float> : public float_avx {};
183template<>
184struct avx<double> : public double_avx {};
185
186template<typename T>
187inline bool is_aligned(avx<T>, const typename avx<T>::value_type* p) {
188 return reinterpret_cast<std::size_t>(p) % 32 == 0;
189}
190
191#endif // CNN_USE_AVX
192
// Generic dot product: returns sum(f1[i] * f2[i]) for i in [0, size).
// Whole registers are read with T::loadu, so no alignment is required of
// f1 or f2. Elements that do not fill a complete register are added one by one.
template<typename T>
inline typename T::value_type dot_product_nonaligned(const typename T::value_type* f1, const typename T::value_type* f2, std::size_t size) {
    typedef typename T::register_type reg_t;
    typedef typename T::value_type    val_t;

    const std::size_t lanes      = T::unroll_size;
    const std::size_t full_steps = size / lanes;

    // Vector part: accumulate lane-wise products, one register per step.
    reg_t acc = T::zero();
    for (std::size_t step = 0; step < full_steps; ++step) {
        const std::size_t base = step * lanes;
        acc = T::add(acc, T::mul(T::loadu(f1 + base), T::loadu(f2 + base)));
    }

    // Fold the lanes into a scalar, then handle the leftover tail.
    val_t total = T::resemble(acc);
    for (std::size_t k = full_steps * lanes; k < size; ++k)
        total += f1[k] * f2[k];

    return total;
}
208
// Generic dot product (aligned): returns sum(f1[i] * f2[i]) for i in [0, size).
// Whole registers are read with T::load; both pointers are asserted to satisfy
// is_aligned for T. Elements that do not fill a complete register are added
// one by one.
template<typename T>
inline typename T::value_type dot_product_aligned(const typename T::value_type* f1, const typename T::value_type* f2, std::size_t size) {
    typedef typename T::register_type reg_t;
    typedef typename T::value_type    val_t;

    reg_t acc = T::zero();

    assert(is_aligned(T(), f1));
    assert(is_aligned(T(), f2));

    const std::size_t lanes      = T::unroll_size;
    const std::size_t full_steps = size / lanes;

    // Vector part: accumulate lane-wise products, one register per step.
    for (std::size_t step = 0; step < full_steps; ++step) {
        const std::size_t base = step * lanes;
        acc = T::add(acc, T::mul(T::load(f1 + base), T::load(f2 + base)));
    }

    // Fold the lanes into a scalar, then handle the leftover tail.
    val_t total = T::resemble(acc);
    for (std::size_t k = full_steps * lanes; k < size; ++k)
        total += f1[k] * f2[k];

    return total;
}
227
// dst[i] += src[i] * c for i in [0, size), using T::load / T::store for whole
// registers. Elements that do not fill a complete register are updated one
// by one.
template<typename T>
inline void muladd_aligned(const typename T::value_type* src, typename T::value_type c, std::size_t size, typename T::value_type* dst) {
    typedef typename T::register_type reg_t;

    const reg_t scale = T::set1(c);  // c broadcast to every lane
    const std::size_t lanes      = T::unroll_size;
    const std::size_t full_steps = size / lanes;

    for (std::size_t step = 0; step < full_steps; ++step) {
        const std::size_t base = step * lanes;
        const reg_t current = T::load(dst + base);
        const reg_t input   = T::load(src + base);
        T::store(dst + base, T::add(current, T::mul(input, scale)));
    }

    for (std::size_t k = full_steps * lanes; k < size; ++k)
        dst[k] += src[k] * c;
}
241
242
// dst[i] += src[i] * c for i in [0, size), using T::loadu / T::storeu for
// whole registers so neither pointer needs to be aligned. Elements that do not
// fill a complete register are updated one by one.
template<typename T>
inline void muladd_nonaligned(const typename T::value_type* src, typename T::value_type c, std::size_t size, typename T::value_type* dst) {
    typedef typename T::register_type reg_t;

    const reg_t scale = T::set1(c);  // c broadcast to every lane
    const std::size_t lanes      = T::unroll_size;
    const std::size_t full_steps = size / lanes;

    for (std::size_t step = 0; step < full_steps; ++step) {
        const std::size_t base = step * lanes;
        const reg_t current = T::loadu(dst + base);
        const reg_t input   = T::loadu(src + base);
        T::storeu(dst + base, T::add(current, T::mul(input, scale)));
    }

    for (std::size_t k = full_steps * lanes; k < size; ++k)
        dst[k] += src[k] * c;
}
256
// dst[i] += src[i] for i in [0, size), using T::loadu / T::storeu for whole
// registers so neither pointer needs to be aligned. Elements that do not fill
// a complete register are updated one by one.
template<typename T>
inline void reduce_nonaligned(const typename T::value_type* src, std::size_t size, typename T::value_type* dst) {
    typedef typename T::register_type reg_t;

    const std::size_t lanes      = T::unroll_size;
    const std::size_t full_steps = size / lanes;

    for (std::size_t step = 0; step < full_steps; ++step) {
        const std::size_t base = step * lanes;
        const reg_t current = T::loadu(dst + base);
        const reg_t input   = T::loadu(src + base);
        T::storeu(dst + base, T::add(current, input));
    }

    for (std::size_t k = full_steps * lanes; k < size; ++k)
        dst[k] += src[k];
}
268
// dst[i] += src[i] for i in [0, size), using the aligned T::load / T::store
// operations for whole registers, matching muladd_aligned and
// dot_product_aligned.
//
// Precondition: src and dst must meet T's alignment requirement; reduce()
// only dispatches here after is_aligned() accepted both pointers. Each
// register starts at a multiple of unroll_size elements from the base
// pointer, so an aligned base keeps every access aligned.
// Elements that do not fill a complete register are updated one by one.
template<typename T>
inline void reduce_aligned(const typename T::value_type* src, std::size_t size, typename T::value_type* dst) {
    for (std::size_t i = 0; i < size/T::unroll_size; i++) {
        typename T::register_type d = T::load(&dst[i*T::unroll_size]);
        typename T::register_type s = T::load(&src[i*T::unroll_size]);
        T::store(&dst[i*T::unroll_size], T::add(d, s));
    }

    // Scalar tail for the remaining size % unroll_size elements.
    for (std::size_t i = (size/T::unroll_size)*T::unroll_size; i < size; i++)
        dst[i] += src[i];
}
280
281} // namespace detail
282
// VECTORIZE_TYPE(T): traits type used by the public entry points for element
// type T. AVX is chosen when CNN_USE_AVX is defined, otherwise SSE when
// CNN_USE_SSE is defined, otherwise the scalar fallback.
#ifdef CNN_USE_AVX
#  define VECTORIZE_TYPE(T) detail::avx<T>
#else
#  ifdef CNN_USE_SSE
#    define VECTORIZE_TYPE(T) detail::sse<T>
#  else
#    define VECTORIZE_TYPE(T) detail::generic_vec_type<T>
#  endif
#endif
290
291// dst[i] += c * src[i]
292template<typename T>
293void muladd(const T* src, T c, std::size_t size, T* dst) {
294 if (detail::is_aligned(VECTORIZE_TYPE(T)(), src, dst))
295 detail::muladd_aligned<VECTORIZE_TYPE(T)>(src, c, size, dst);
296 else
297 detail::muladd_nonaligned<VECTORIZE_TYPE(T)>(src, c, size, dst);
298}
299
300// sum(s1[i] * s2[i])
301template<typename T>
302T dot(const T* s1, const T* s2, std::size_t size) {
303 if (detail::is_aligned(VECTORIZE_TYPE(T)(), s1, s2))
304 return detail::dot_product_aligned<VECTORIZE_TYPE(T)>(s1, s2, size);
305 else
306 return detail::dot_product_nonaligned<VECTORIZE_TYPE(T)>(s1, s2, size);
307}
308
310template<typename T>
311void reduce(const T* src, std::size_t size, T* dst) {
312 if (detail::is_aligned(VECTORIZE_TYPE(T)(), src, dst))
313 return detail::reduce_aligned<VECTORIZE_TYPE(T)>(src, size, dst);
314 else
315 return detail::reduce_nonaligned<VECTORIZE_TYPE(T)>(src, size, dst);
316}
317
318} // namespace vectorize