Support
Quality
Security
License
Reuse
Coming Soon for all Libraries!
Currently covering the most popular Java, JavaScript and Python libraries. See a SAMPLE HERE.
kandi's functional review helps you automatically verify the functionality of the libraries and avoid rework.
Latest [Git](https://git-scm.com/downloads) with Large File Support selected in the setup on the components dialog.
[Visual Studio 2022](https://www.visualstudio.com/downloads/) with the following workloads: .NET desktop development with .NET Framework 4.7.2 targeting pack Desktop development with C++ with Windows 10 SDK (10.0.18362.0) (it’s currently enabled by default but it might change) MSVC v143 - VS2022 C++ x64/x86 build tools (v14.30) or later version (should be enabled by default) C++/CLI support for v143 build tools (v14.30) or later version (not enabled by default) Optional (to target UWP): Universal Windows Platform development with Windows 10 SDK (10.0.18362.0) or later version MSVC v143 - VS2022 C++ ARM build tools (v14.30) or later version (not enabled by default) Optional (to target iOS/Android): Mobile development with .NET and Android SDK setup (API level 27) individual component, then in Visual Studio go to Tools > Android > Android SDK Manager and install NDK (version 19+) from Tools tab.
[FBX SDK 2019.0 VS2015](https://www.autodesk.com/developer-network/platform-technologies/fbx-sdk-2019-0)
Open a command prompt, point it to a directory and clone Stride to it: git clone https://github.com/stride3d/stride.git
Open <StrideDir>\build\Stride.sln with Visual Studio 2022 and build Stride.GameStudio (it should be the default startup project) or run it from VS’s toolbar. Optionally, open and build Stride.Android.sln, Stride.iOS.sln, etc.
Install [Visual Studio Build Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe) with the same prerequisites listed above
Add MSBuild’s directory to your system’s PATH (ex: C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Current\Bin)
Open a command prompt, point it to a directory and clone Stride to it: git clone https://github.com/stride3d/stride.git
Navigate to /Build with the command prompt, input msbuild /t:Restore Stride.sln then compile.bat
variadic template: SFINAE on last argument
// Unit tests for index(): maps an N-dimensional coordinate to a flat offset
// against the stride (extent) table {4, 6, 3, 5}.
TEST_CASE("index")
{
std::vector<size_t> strides = { 4, 6, 3, 5 };
// Supplying fewer indices than dimensions implicitly zero-pads the
// leading (slowest-varying) dimensions.
SECTION("Padding with leading zeros")
{
constexpr auto i0 = 4;
constexpr auto i1 = 2;
// Horner evaluation over the trailing dims: i1 * 5 + i0.
constexpr size_t expected = i0 + i1 * 5;
CHECK(index(strides, 0, 0, i1, i0) == expected);
CHECK(index(strides, 0, 0, i1, i0 - 1) == expected - 1); // decrementing the last index shifts the flat offset by exactly one
CHECK(index(strides, i1, i0) == expected);
CHECK(index(strides, i1, i0 - 1) == expected - 1);
}
// A trailing my::missing sentinel zero-pads the trailing dimensions
// instead; the result is scaled by the product of the unused tail extents.
SECTION("Use my::missing to use padding with tailing zeros")
{
constexpr auto i2 = 4;
constexpr auto i3 = 2;
// (i3 * 6 + i2) addresses the leading dims {4, 6}; the factor 3 * 5
// accounts for the two tail dimensions padded with zeros.
constexpr size_t expected = (i3 * 6 + i2) * 5 * 3;
CHECK(index(strides, i3, i2, 0, 0) == expected);
CHECK(index(strides, i3, i2, my::missing) == expected);
}
}
// --- trailing-type inspection -------------------------------------------
// Recursive metafunction: peel one type off the pack until one remains.
template <typename Head, typename... Tail>
struct last_type_helper {
    using type = typename last_type_helper<Tail...>::type;
};
// Base case: a single-element pack — that element is the last type.
template <typename Only>
struct last_type_helper<Only> {
    using type = Only;
};
// Convenience alias for the final type of a parameter pack.
template <typename... Ts>
using last_type = typename last_type_helper<Ts...>::type;
// Sentinel tag: pass my::missing as the final argument of index() to
// request tail-padding semantics.
enum class my { missing };
// True when the last argument type of the pack is the `my` sentinel.
template <typename... Ts>
constexpr bool LastTypeIsMy = std::is_same<my, last_type<Ts...>>::value;
// Recursion terminus: no indices left to fold in.
template <class StrideIter>
size_t index_impl(size_t acc, StrideIter)
{
    return acc;
}
// Terminus for a trailing my::missing sentinel: simply ignore it.
template <class StrideIter>
size_t index_impl(size_t acc, StrideIter, my)
{
    return acc;
}
// Horner-style accumulation: acc = acc * extent + index, one dim at a time.
template <class StrideIter, typename Head, typename... Rest>
size_t index_impl(size_t acc, StrideIter it, Head first, Rest... rest)
{
    const size_t folded = acc * *it + first;
    return index_impl(folded, it + 1, rest...);
}
// Map a multidimensional coordinate onto a flat offset.
//
// `strides` holds the extent of every dimension. Passing fewer indices
// than dimensions zero-pads the leading dimensions; terminating the
// argument list with my::missing zero-pads the trailing dimensions
// instead (the result is then scaled by the product of the unused tail
// extents).
template <class S, class... Args>
size_t index(const S& strides, Args... args)
{
    const size_t unused = strides.size() - sizeof...(Args);
    const bool tailMode = LastTypeIsMy<Args...>;
    const size_t skipFront = tailMode ? 0 : unused;       // leading pad: start deeper in the table
    const size_t tailCount = tailMode ? unused + 1 : 0;   // trailing pad: extents not consumed by args
    const size_t tailFactor = std::accumulate(std::end(strides) - tailCount,
        std::end(strides), size_t { 1 }, std::multiplies<> {});
    return index_impl(0, std::begin(strides) + skipFront, args...) * tailFactor;
}
-----------------------
TEST_CASE("index")
{
std::vector<size_t> strides = { 4, 6, 3, 5 };
SECTION("Padding with leading zeros")
{
constexpr auto i0 = 4;
constexpr auto i1 = 2;
constexpr size_t expected = i0 + i1 * 5;
CHECK(index(strides, 0, 0, i1, i0) == expected);
CHECK(index(strides, 0, 0, i1, i0 - 1) == expected - 1); // last index indexes by one
CHECK(index(strides, i1, i0) == expected);
CHECK(index(strides, i1, i0 - 1) == expected - 1);
}
SECTION("Use my::missing to use padding with tailing zeros")
{
constexpr auto i2 = 4;
constexpr auto i3 = 2;
constexpr size_t expected = (i3 * 6 + i2) * 5 * 3;
CHECK(index(strides, i3, i2, 0, 0) == expected);
CHECK(index(strides, i3, i2, my::missing) == expected);
}
}
template <typename T, typename... Ts>
struct last_type_helper {
using type = typename last_type_helper<Ts...>::type;
};
template <typename T>
struct last_type_helper<T> {
using type = T;
};
template <typename... Ts>
using last_type = typename last_type_helper<Ts...>::type;
enum class my { missing };
template <typename... Ts>
constexpr bool LastTypeIsMy = std::is_same<my, last_type<Ts...>>::value;
template <class StrideIter>
size_t index_impl(size_t base, StrideIter)
{
return base;
}
template <class StrideIter>
size_t index_impl(size_t base, StrideIter, my)
{
return base;
}
template <class StrideIter, typename Tn, typename... Ts>
size_t index_impl(size_t base, StrideIter it, Tn xn, Ts... x)
{
return index_impl(base * *it + xn, it + 1, x...);
}
template <class S, class... Args>
size_t index(const S& strides, Args... args)
{
const size_t offset = strides.size() - sizeof...(Args);
const size_t advenceBy = LastTypeIsMy<Args...> ? 0 : offset;
const size_t lastStrides = LastTypeIsMy<Args...> ? offset + 1 : 0;
const size_t tailFactor = std::accumulate(std::end(strides) - lastStrides, std::end(strides),
size_t { 1 }, std::multiplies<> {});
return index_impl(0, std::begin(strides) + advenceBy, args...) * tailFactor;
}
-----------------------
// Classify an index pack for SFINAE dispatch in index():
//   1 -> every argument is an integral index
//   2 -> all but the last are integral and the last is the `my` sentinel
//   0 -> anything else (no index() overload will match)
template<typename... Idxs>
constexpr int indexing_type()
{
    // An empty pack was already ill-formed (zero-sized arrays below);
    // make the diagnostic explicit.
    static_assert(sizeof...(Idxs) > 0, "indexing_type() requires at least one index type");
    size_t integrals = 0;
    bool unused[] = {(integrals += std::is_integral<Idxs>::value, false)...};
    (void)unused;
    bool mys[] = {std::is_same<my, Idxs>::value...};
    // BUG FIX: the original indexed with sizeof(mys) - 1, which counts
    // bytes and only equals the element count when sizeof(bool) == 1 —
    // an implementation-defined property. Use the pack size instead.
    bool last = mys[sizeof...(Idxs) - 1];
    if(integrals == sizeof...(Idxs))
        return 1;
    if(integrals == sizeof...(Idxs) - 1 && last)
        return 2;
    return 0;
}
// Dot product of a slice of `strides` (beginning at `off`) with the given
// indices: returns sum over i of strides[off + i] * idxs[i].
template<typename S, size_t... Is, typename... Idxs>
inline auto mul_reduce(const S& strides, size_t off, std::index_sequence<Is...>, Idxs... idxs)
{
    // Materialise each stride*index product, then fold them left to right.
    const size_t terms[] = { strides[off + Is] * size_t(idxs)... };
    size_t total = 0;
    for (size_t term : terms)
        total += term;
    return total;
}
// index() overload for a pack of plain integral indices.
// Right-aligns the indices against the stride table, so any leading
// dimensions not covered by the pack are implicitly zero.
template<typename S, typename... Idxs>
inline auto index(const S& strides, Idxs... idxs)
-> std::enable_if_t<indexing_type<Idxs...>() == 1, size_t>
{
    const auto skipped = strides.size() - sizeof...(Idxs);
    return mul_reduce(strides, skipped, std::make_index_sequence<sizeof...(Idxs)>{}, idxs...);
}
// index() overload selected when the pack ends with the `my` sentinel:
// indices are left-aligned against the stride table instead (the sentinel
// itself converts to zero inside mul_reduce).
template<typename S, typename... Idxs>
inline auto index(const S& strides, Idxs... idxs)
-> std::enable_if_t<indexing_type<Idxs...>() == 2, size_t>
{
    constexpr size_t count = sizeof...(Idxs);
    return mul_reduce(strides, 0, std::make_index_sequence<count>{}, idxs...);
}
Convolution Function Latency Bottleneck
/* Scalar multiply-add: computes a * b + c for single floats. */
float fma_scl(float a, float b, float c) {
    /* Addition is commutative in IEEE 754, so c + a * b is bit-identical. */
    return c + a * b;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
/* 8-lane float vector via GCC's vector_size extension: 32 bytes, 32-byte aligned. */
typedef float Vec __attribute__((vector_size(32), aligned(32)));
/* Lane-wise multiply-add: a * b + c on all 8 floats.
   NOTE(review): whether a fused FMA instruction is emitted depends on the
   target and -ffp-contract settings — the listing below shows vfmadd132ps. */
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
/* Plain-struct counterpart of Vec: eight floats with no alignment demand. */
typedef struct {
    float f[8];
} Vec_;

/* Element-wise multiply-add over all eight lanes: r[i] = a[i] * b[i] + c[i]. */
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
    Vec_ out;
    unsigned lane = 0;
    while (lane < 8) {
        out.f[lane] = a.f[lane] * b.f[lane] + c.f[lane];
        ++lane;
    }
    return out;
}
/* In-place scalar FMA over arrays: a[i] = b[i] * c[i] + a[i] for i in [0, n).
   `restrict` promises the three arrays do not alias, enabling vectorization. */
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
    unsigned i = 0;
    while (i < n) {
        a[i] = fma_scl(b[i], c[i], a[i]);
        ++i;
    }
}
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
/* Vector-wide variant: treats the arrays as n/8 blocks of 8 floats each
   and applies fma_vec block-by-block. Any remainder elements are ignored. */
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
    const unsigned blocks = n / 8;
    for (unsigned i = 0; i < blocks; ++i) {
        a[i] = fma_vec(b[i], c[i], a[i]);
    }
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
-----------------------
float fma_scl(float a, float b, float c) {
return a * b + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
typedef struct {
float f[8];
} Vec_;
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
Vec_ r;
for (unsigned i = 0; i < 8; ++i) {
r.f[i] = a.f[i] * b.f[i] + c.f[i];
}
return r;
}
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_scl(b[i], c[i], a[i]);
}
}
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
n /= 8;
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_vec(b[i], c[i], a[i]);
}
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
-----------------------
float fma_scl(float a, float b, float c) {
return a * b + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
typedef struct {
float f[8];
} Vec_;
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
Vec_ r;
for (unsigned i = 0; i < 8; ++i) {
r.f[i] = a.f[i] * b.f[i] + c.f[i];
}
return r;
}
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_scl(b[i], c[i], a[i]);
}
}
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
n /= 8;
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_vec(b[i], c[i], a[i]);
}
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
-----------------------
float fma_scl(float a, float b, float c) {
return a * b + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
typedef struct {
float f[8];
} Vec_;
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
Vec_ r;
for (unsigned i = 0; i < 8; ++i) {
r.f[i] = a.f[i] * b.f[i] + c.f[i];
}
return r;
}
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_scl(b[i], c[i], a[i]);
}
}
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
n /= 8;
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_vec(b[i], c[i], a[i]);
}
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
-----------------------
float fma_scl(float a, float b, float c) {
return a * b + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
typedef struct {
float f[8];
} Vec_;
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
Vec_ r;
for (unsigned i = 0; i < 8; ++i) {
r.f[i] = a.f[i] * b.f[i] + c.f[i];
}
return r;
}
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_scl(b[i], c[i], a[i]);
}
}
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
n /= 8;
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_vec(b[i], c[i], a[i]);
}
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
-----------------------
float fma_scl(float a, float b, float c) {
return a * b + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
typedef struct {
float f[8];
} Vec_;
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
Vec_ r;
for (unsigned i = 0; i < 8; ++i) {
r.f[i] = a.f[i] * b.f[i] + c.f[i];
}
return r;
}
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_scl(b[i], c[i], a[i]);
}
}
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
n /= 8;
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_vec(b[i], c[i], a[i]);
}
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
-----------------------
float fma_scl(float a, float b, float c) {
return a * b + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
typedef struct {
float f[8];
} Vec_;
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
Vec_ r;
for (unsigned i = 0; i < 8; ++i) {
r.f[i] = a.f[i] * b.f[i] + c.f[i];
}
return r;
}
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_scl(b[i], c[i], a[i]);
}
}
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
n /= 8;
for (unsigned i = 0; i < n; ++i) {
a[i] = fma_vec(b[i], c[i], a[i]);
}
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
Input 0 of layer "model" is incompatible with the layer: expected shape=(None, 512, 512, 3), found shape=(512, 512, 3)
import tensorflow as tf

# U-Net segmentation model: 512x512 RGB input -> single-channel sigmoid mask.
inputs = tf.keras.layers.Input((512, 512, 3))


def _double_conv(x, filters, drop_rate):
    """Two same-padded 3x3 ReLU convolutions with dropout in between."""
    x = tf.keras.layers.Conv2D(filters, (3, 3), activation="relu",
                               kernel_initializer="he_normal", padding="same")(x)
    x = tf.keras.layers.Dropout(drop_rate)(x)
    return tf.keras.layers.Conv2D(filters, (3, 3), activation="relu",
                                  kernel_initializer="he_normal", padding="same")(x)


# Contracting path: each stage doubles the filters, then halves the resolution.
c1 = _double_conv(inputs, 16, 0.1)
p1 = tf.keras.layers.MaxPooling2D((2, 2))(c1)
c2 = _double_conv(p1, 32, 0.1)
p2 = tf.keras.layers.MaxPooling2D((2, 2))(c2)
c3 = _double_conv(p2, 64, 0.2)
p3 = tf.keras.layers.MaxPooling2D((2, 2))(c3)
c4 = _double_conv(p3, 128, 0.2)
p4 = tf.keras.layers.MaxPooling2D((2, 2))(c4)

# Bottleneck.
c5 = _double_conv(p4, 256, 0.3)

# Expanding path: transpose-conv upsample, concatenate the skip connection,
# then another double convolution.
u6 = tf.keras.layers.Conv2DTranspose(128, (2, 2), strides=(2, 2), padding="same")(c5)
u6 = tf.keras.layers.concatenate([u6, c4])
c6 = _double_conv(u6, 128, 0.2)
u7 = tf.keras.layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding="same")(c6)
u7 = tf.keras.layers.concatenate([u7, c3])
c7 = _double_conv(u7, 64, 0.2)
u8 = tf.keras.layers.Conv2DTranspose(32, (2, 2), strides=(2, 2), padding="same")(c7)
u8 = tf.keras.layers.concatenate([u8, c2])
c8 = _double_conv(u8, 32, 0.1)
u9 = tf.keras.layers.Conv2DTranspose(16, (2, 2), strides=(2, 2), padding="same")(c8)
u9 = tf.keras.layers.concatenate([u9, c1], axis=3)

# 1x1 sigmoid head produces the per-pixel mask.
outputs = tf.keras.layers.Conv2D(1, (1, 1), activation="sigmoid")(u9)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()
How can I return a multidimensional array from a C shared library to Python as a NumPy array (np.array)?
import ctypes
import numpy as np

# Example inputs: 4D C-contiguous float64 arrays (input tensor and kernel).
a = np.empty((16, 16, 12, 12), dtype=np.float64)
b = np.empty((8, 8, 4, 4), dtype=np.float64)

# np.ctypeslib.load_library is preferred over plain ctypes.CDLL in the
# NumPy documentation. The DLL/SO file is found in:
# Windows: ".\utils.dll"
# Linux: "./libutils.so"
utils = np.ctypeslib.load_library('utils', '.')

INT = ctypes.c_int64
PINT = ctypes.POINTER(ctypes.c_int64)
PDOUBLE = ctypes.POINTER(ctypes.c_double)
ND_POINTER_4 = np.ctypeslib.ndpointer(dtype=np.float64, ndim=4, flags="C_CONTIGUOUS")

# Declare the C signature so ctypes validates and converts every argument.
utils.somefunction.argtypes = [
    ND_POINTER_4, INT, INT, INT, INT,
    ND_POINTER_4, INT, INT, INT, INT,
    PINT, PINT, PINT, PINT, PINT
]
utils.somefunction.restype = PDOUBLE

# Out-parameters that receive the five dimensions of the result buffer.
d1_out, d2_out, d3_out, d4_out, d5_out = (INT() for _ in range(5))
p_d1_out, p_d2_out, p_d3_out, p_d4_out, p_d5_out = (
    ctypes.pointer(dim) for dim in (d1_out, d2_out, d3_out, d4_out, d5_out)
)

out = utils.somefunction(a, a.shape[0], a.shape[1], a.shape[2], a.shape[3],
                         b, b.shape[0], b.shape[1], b.shape[2], b.shape[3],
                         p_d1_out, p_d2_out, p_d3_out, p_d4_out, p_d5_out)

# Unwrap the ctypes integers into plain Python ints.
d1_out, d2_out, d3_out, d4_out, d5_out = (
    dim.value for dim in (d1_out, d2_out, d3_out, d4_out, d5_out)
)

# Wrap the returned C buffer as a NumPy view — no copy is made.
result = np.ctypeslib.as_array(out, shape=(d1_out, d2_out, d3_out, d4_out, d5_out))
# Some operations
# WARNING:
# You should free the memory of the allocated buffer
# with `free(out)` when you are done with `result`
# since Numpy does not free it for you: it just creates
# a view and does not take the ownership.
# Note that the right libc must be used, otherwise the
# call to free will cause an undefined behaviour
# (eg. crash, error message, nothing)
/* utils.c */
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
double* somefunction(
double* inputMatrix, int64_t d1_inputMatrix, int64_t d2_inputMatrix, int64_t h_inputMatrix, int64_t w_inputMatrix,
double* kernel, int64_t d1_kernel, int64_t d2_kernel, int64_t h_kernel, int64_t w_kernel,
int64_t* d1_out, int64_t* d2_out, int64_t* d3_out, int64_t* d4_out, int64_t* d5_out
)
{
*d1_out = d1_kernel;
*d2_out = d2_kernel;
*d3_out = d2_inputMatrix;
*d4_out = h_inputMatrix - h_kernel + 1;
*d5_out = w_inputMatrix - w_kernel + 1;
const size_t size = *d1_out * *d2_out * *d3_out * *d4_out * *d5_out;
double* result = malloc(size * sizeof(double));
if(result == NULL)
{
fprintf(stderr, "Unable to allocate an array of %d bytes", size * sizeof(double));
return NULL;
}
/* Some operation: fill `result` */
return result;
}
# On Windows
gcc utils.c -shared -o utils.dll
# On Linux
gcc utils.c -fPIC -shared -o libutils.so
-----------------------
import ctypes
import numpy as np
# Example of input
a = np.empty((16, 16, 12, 12), dtype=np.float64)
b = np.empty((8, 8, 4, 4), dtype=np.float64)
# Better than CDLL regarding the Numpy documentation.
# Here the DLL/SO file is found in:
# Windows: ".\utils.dll"
# Linux: "./libutils.so"
utils = np.ctypeslib.load_library('utils', '.')
INT = ctypes.c_int64
PINT = ctypes.POINTER(ctypes.c_int64)
PDOUBLE = ctypes.POINTER(ctypes.c_double)
ND_POINTER_4 = np.ctypeslib.ndpointer(dtype=np.float64, ndim=4, flags="C_CONTIGUOUS")
utils.somefunction.argtypes = [
ND_POINTER_4, INT, INT, INT, INT,
ND_POINTER_4, INT, INT, INT, INT,
PINT, PINT, PINT, PINT, PINT
]
utils.somefunction.restype = PDOUBLE
d1_out, d2_out, d3_out, d4_out, d5_out = INT(), INT(), INT(), INT(), INT()
p_d1_out = ctypes.pointer(d1_out)
p_d2_out = ctypes.pointer(d2_out)
p_d3_out = ctypes.pointer(d3_out)
p_d4_out = ctypes.pointer(d4_out)
p_d5_out = ctypes.pointer(d5_out)
out = utils.somefunction(a, a.shape[0], a.shape[1], a.shape[2], a.shape[3],
b, b.shape[0], b.shape[1], b.shape[2], b.shape[3],
p_d1_out, p_d2_out, p_d3_out, p_d4_out, p_d5_out)
d1_out = d1_out.value
d2_out = d2_out.value
d3_out = d3_out.value
d4_out = d4_out.value
d5_out = d5_out.value
result = np.ctypeslib.as_array(out, shape=(d1_out, d2_out, d3_out, d4_out, d5_out))
# Some operations
# WARNING:
# You should free the memory of the allocated buffer
# with `free(out)` when you are done with `result`
# since Numpy does not free it for you: it just creates
# a view and does not take the ownership.
# Note that the right libc must be used, otherwise the
# call to free will cause an undefined behaviour
# (eg. crash, error message, nothing)
/* utils.c */
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
/*
 * Allocate and return a 5-D output buffer for a batched "valid"
 * correlation between `inputMatrix` and `kernel`.
 *
 * The output dimensions are written through the out-pointers:
 *   *d1_out = d1_kernel
 *   *d2_out = d2_kernel
 *   *d3_out = d2_inputMatrix
 *   *d4_out = h_inputMatrix - h_kernel + 1
 *   *d5_out = w_inputMatrix - w_kernel + 1
 *
 * Returns a malloc'ed buffer that the CALLER must free(), or NULL on
 * allocation failure. Note: the data behind `inputMatrix`/`kernel` is
 * not read here; only the dimension arguments are used.
 */
double* somefunction(
    double* inputMatrix, int64_t d1_inputMatrix, int64_t d2_inputMatrix, int64_t h_inputMatrix, int64_t w_inputMatrix,
    double* kernel, int64_t d1_kernel, int64_t d2_kernel, int64_t h_kernel, int64_t w_kernel,
    int64_t* d1_out, int64_t* d2_out, int64_t* d3_out, int64_t* d4_out, int64_t* d5_out
)
{
    *d1_out = d1_kernel;
    *d2_out = d2_kernel;
    *d3_out = d2_inputMatrix;
    *d4_out = h_inputMatrix - h_kernel + 1;
    *d5_out = w_inputMatrix - w_kernel + 1;
    /* Accumulate the product in size_t to avoid signed overflow. */
    const size_t size = (size_t)(*d1_out) * (size_t)(*d2_out) * (size_t)(*d3_out)
                      * (size_t)(*d4_out) * (size_t)(*d5_out);
    /* Explicit cast keeps the code valid in both C and C++. */
    double* result = (double*)malloc(size * sizeof(double));
    if(result == NULL)
    {
        /* BUG FIX: size_t must be printed with %zu, not %d — the wrong
           specifier is undefined behaviour (and truncates on LP64). */
        fprintf(stderr, "Unable to allocate an array of %zu bytes", size * sizeof(double));
        return NULL;
    }
    /* Some operation: fill `result` */
    return result;
}
# Build the shared library loaded by np.ctypeslib.load_library('utils', '.')
# On Windows
# (MinGW/MSYS2 gcc; produces utils.dll next to the Python script)
gcc utils.c -shared -o utils.dll
# On Linux
# (-fPIC: position-independent code, required for shared objects;
#  the "lib" prefix is what load_library expects on Linux)
gcc utils.c -fPIC -shared -o libutils.so
-----------------------
import ctypes
import numpy as np

# Example of input
a = np.empty((16, 16, 12, 12), dtype=np.float64)
b = np.empty((8, 8, 4, 4), dtype=np.float64)

# Better than CDLL regarding the Numpy documentation.
# Here the DLL/SO file is found in:
# Windows: ".\utils.dll"
# Linux: "./libutils.so"
utils = np.ctypeslib.load_library('utils', '.')

INT = ctypes.c_int64
PINT = ctypes.POINTER(ctypes.c_int64)
PDOUBLE = ctypes.POINTER(ctypes.c_double)
# 4-D, C-contiguous float64 arrays are passed as raw data pointers.
ND_POINTER_4 = np.ctypeslib.ndpointer(dtype=np.float64, ndim=4, flags="C_CONTIGUOUS")

# Signature of the C function:
#   double* somefunction(double*, 4x int64, double*, 4x int64, 5x int64*)
utils.somefunction.argtypes = [
    ND_POINTER_4, INT, INT, INT, INT,
    ND_POINTER_4, INT, INT, INT, INT,
    PINT, PINT, PINT, PINT, PINT
]
utils.somefunction.restype = PDOUBLE

# Out-parameters the C side fills with the 5 output dimensions.
d1_out, d2_out, d3_out, d4_out, d5_out = INT(), INT(), INT(), INT(), INT()
p_d1_out = ctypes.pointer(d1_out)
p_d2_out = ctypes.pointer(d2_out)
p_d3_out = ctypes.pointer(d3_out)
p_d4_out = ctypes.pointer(d4_out)
p_d5_out = ctypes.pointer(d5_out)

out = utils.somefunction(a, a.shape[0], a.shape[1], a.shape[2], a.shape[3],
                         b, b.shape[0], b.shape[1], b.shape[2], b.shape[3],
                         p_d1_out, p_d2_out, p_d3_out, p_d4_out, p_d5_out)

# BUG FIX: the C function returns NULL when malloc fails; a NULL ctypes
# pointer is falsy, and passing it to as_array would raise a confusing
# error later. Fail fast with a clear message instead.
if not out:
    raise MemoryError("somefunction failed to allocate its output buffer")

d1_out = d1_out.value
d2_out = d2_out.value
d3_out = d3_out.value
d4_out = d4_out.value
d5_out = d5_out.value

# `as_array` creates a VIEW over the C-allocated buffer (no copy).
result = np.ctypeslib.as_array(out, shape=(d1_out, d2_out, d3_out, d4_out, d5_out))
# Some operations
# WARNING:
# You should free the memory of the allocated buffer
# with `free(out)` when you are done with `result`
# since Numpy does not free it for you: it just creates
# a view and does not take the ownership.
# Note that the right libc must be used, otherwise the
# call to free will cause an undefined behaviour
# (eg. crash, error message, nothing)
/* utils.c */
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
/*
 * Allocate and return a 5-D output buffer for a batched "valid"
 * correlation between `inputMatrix` and `kernel`.
 *
 * The output dimensions are written through the out-pointers:
 *   *d1_out = d1_kernel
 *   *d2_out = d2_kernel
 *   *d3_out = d2_inputMatrix
 *   *d4_out = h_inputMatrix - h_kernel + 1
 *   *d5_out = w_inputMatrix - w_kernel + 1
 *
 * Returns a malloc'ed buffer that the CALLER must free(), or NULL on
 * allocation failure. Note: the data behind `inputMatrix`/`kernel` is
 * not read here; only the dimension arguments are used.
 */
double* somefunction(
    double* inputMatrix, int64_t d1_inputMatrix, int64_t d2_inputMatrix, int64_t h_inputMatrix, int64_t w_inputMatrix,
    double* kernel, int64_t d1_kernel, int64_t d2_kernel, int64_t h_kernel, int64_t w_kernel,
    int64_t* d1_out, int64_t* d2_out, int64_t* d3_out, int64_t* d4_out, int64_t* d5_out
)
{
    *d1_out = d1_kernel;
    *d2_out = d2_kernel;
    *d3_out = d2_inputMatrix;
    *d4_out = h_inputMatrix - h_kernel + 1;
    *d5_out = w_inputMatrix - w_kernel + 1;
    /* Accumulate the product in size_t to avoid signed overflow. */
    const size_t size = (size_t)(*d1_out) * (size_t)(*d2_out) * (size_t)(*d3_out)
                      * (size_t)(*d4_out) * (size_t)(*d5_out);
    /* Explicit cast keeps the code valid in both C and C++. */
    double* result = (double*)malloc(size * sizeof(double));
    if(result == NULL)
    {
        /* BUG FIX: size_t must be printed with %zu, not %d — the wrong
           specifier is undefined behaviour (and truncates on LP64). */
        fprintf(stderr, "Unable to allocate an array of %zu bytes", size * sizeof(double));
        return NULL;
    }
    /* Some operation: fill `result` */
    return result;
}
# Build the shared library loaded by np.ctypeslib.load_library('utils', '.')
# On Windows
# (MinGW/MSYS2 gcc; produces utils.dll next to the Python script)
gcc utils.c -shared -o utils.dll
# On Linux
# (-fPIC: position-independent code, required for shared objects;
#  the "lib" prefix is what load_library expects on Linux)
gcc utils.c -fPIC -shared -o libutils.so
Saving model on Tensorflow 2.7.0 with data augmentation layer
import tensorflow as tf
import numpy as np
class RandomColorDistortion(tf.keras.layers.Layer):
    """Augmentation layer applying a random contrast and brightness shift.

    NOTE(review): the random factors are drawn with np.random inside
    call(); under tf.function tracing they are presumably sampled once
    at trace time rather than per batch — confirm whether tf.random
    ops are wanted instead.
    """
    def __init__(self, contrast_range=[0.5, 1.5],
                 brightness_delta=[-0.2, 0.2], **kwargs):
        super(RandomColorDistortion, self).__init__(**kwargs)
        # Ranges sampled uniformly in call(); also serialized by get_config.
        self.contrast_range = contrast_range
        self.brightness_delta = brightness_delta

    def call(self, images, training=None):
        # Augmentation is a no-op outside of training mode.
        if not training:
            return images
        contrast = np.random.uniform(
            self.contrast_range[0], self.contrast_range[1])
        brightness = np.random.uniform(
            self.brightness_delta[0], self.brightness_delta[1])
        images = tf.image.adjust_contrast(images, contrast)
        images = tf.image.adjust_brightness(images, brightness)
        # Keep pixel values in the [0, 1] range expected downstream.
        images = tf.clip_by_value(images, 0, 1)
        return images

    def get_config(self):
        # Serialize the constructor arguments so the layer can be
        # restored via load_model(..., custom_objects=...).
        config = super(RandomColorDistortion, self).get_config()
        config.update({"contrast_range": self.contrast_range, "brightness_delta": self.brightness_delta})
        return config
# In-model augmentation pipeline (random layers are active only in training).
input_shape_rgb = (256, 256, 3)
data_augmentation_rgb = tf.keras.Sequential(
    [
        tf.keras.layers.RandomFlip("horizontal"),
        tf.keras.layers.RandomFlip("vertical"),
        tf.keras.layers.RandomRotation(0.5),
        tf.keras.layers.RandomZoom(0.5),
        tf.keras.layers.RandomContrast(0.5),
        RandomColorDistortion(name='random_contrast_brightness/none'),
    ]
)

# CNN classifier: 4 conv/pool/batch-norm stages, then a dense head
# ending in a 5-way softmax.
input_shape = (256, 256, 3)
padding = 'same'
kernel_size = 3
model = tf.keras.Sequential([
    tf.keras.layers.Input(input_shape),
    data_augmentation_rgb,
    tf.keras.layers.Rescaling((1./255)),
    tf.keras.layers.Conv2D(16, kernel_size, padding=padding, activation='relu', strides=1,
                           data_format='channels_last'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(32, kernel_size, padding=padding, activation='relu'), # best 4
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(64, kernel_size, padding=padding, activation='relu'), # best 3
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(128, kernel_size, padding=padding, activation='relu'), # best 3
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'), # best 1
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(128, activation='relu'), # best 1
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(64, activation='relu'), # best 1
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(5, activation = 'softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# BUG FIX: `save` writes to the exact path given — the original saved
# to a file named "test" and then tried to load "test.h5", which fails
# with file-not-found. Save with the .h5 extension the load expects.
model.save("test.h5", save_format='h5')
# Custom layers must be mapped back to their classes when reloading.
model = tf.keras.models.load_model('test.h5', custom_objects={'RandomColorDistortion': RandomColorDistortion})
-----------------------
import tensorflow as tf
import numpy as np
class RandomColorDistortion(tf.keras.layers.Layer):
    """Augmentation layer applying a random contrast and brightness shift.

    NOTE(review): the random factors are drawn with np.random inside
    call(); under tf.function tracing they are presumably sampled once
    at trace time rather than per batch — confirm whether tf.random
    ops are wanted instead.
    """
    def __init__(self, contrast_range=[0.5, 1.5],
                 brightness_delta=[-0.2, 0.2], **kwargs):
        super(RandomColorDistortion, self).__init__(**kwargs)
        # Ranges sampled uniformly in call(); also serialized by get_config.
        self.contrast_range = contrast_range
        self.brightness_delta = brightness_delta

    def call(self, images, training=None):
        # Augmentation is a no-op outside of training mode.
        if not training:
            return images
        contrast = np.random.uniform(
            self.contrast_range[0], self.contrast_range[1])
        brightness = np.random.uniform(
            self.brightness_delta[0], self.brightness_delta[1])
        images = tf.image.adjust_contrast(images, contrast)
        images = tf.image.adjust_brightness(images, brightness)
        # Keep pixel values in the [0, 1] range expected downstream.
        images = tf.clip_by_value(images, 0, 1)
        return images

    def get_config(self):
        # Serialize the constructor arguments so the layer can be
        # restored via load_model(..., custom_objects=...).
        config = super(RandomColorDistortion, self).get_config()
        config.update({"contrast_range": self.contrast_range, "brightness_delta": self.brightness_delta})
        return config
# In-model augmentation pipeline (random layers are active only in training).
input_shape_rgb = (256, 256, 3)
data_augmentation_rgb = tf.keras.Sequential(
    [
        tf.keras.layers.RandomFlip("horizontal"),
        tf.keras.layers.RandomFlip("vertical"),
        tf.keras.layers.RandomRotation(0.5),
        tf.keras.layers.RandomZoom(0.5),
        tf.keras.layers.RandomContrast(0.5),
        RandomColorDistortion(name='random_contrast_brightness/none'),
    ]
)

# CNN classifier: 4 conv/pool/batch-norm stages, then a dense head
# ending in a 5-way softmax.
input_shape = (256, 256, 3)
padding = 'same'
kernel_size = 3
model = tf.keras.Sequential([
    tf.keras.layers.Input(input_shape),
    data_augmentation_rgb,
    tf.keras.layers.Rescaling((1./255)),
    tf.keras.layers.Conv2D(16, kernel_size, padding=padding, activation='relu', strides=1,
                           data_format='channels_last'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(32, kernel_size, padding=padding, activation='relu'), # best 4
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(64, kernel_size, padding=padding, activation='relu'), # best 3
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(128, kernel_size, padding=padding, activation='relu'), # best 3
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'), # best 1
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(128, activation='relu'), # best 1
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(64, activation='relu'), # best 1
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(5, activation = 'softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# BUG FIX: `save` writes to the exact path given — the original saved
# to a file named "test" and then tried to load "test.h5", which fails
# with file-not-found. Save with the .h5 extension the load expects.
model.save("test.h5", save_format='h5')
# Custom layers must be mapped back to their classes when reloading.
model = tf.keras.models.load_model('test.h5', custom_objects={'RandomColorDistortion': RandomColorDistortion})
Why is creating a temp array with Stride illegal in an Array extension
// Materialize the stride's lazy sequence of chunk-start offsets
// (0, chunkSize, 2*chunkSize, … < count) into an Int array.
let indexes = Array<Int>(stride(from: 0, to: count, by: chunkSize))
How strides help in traversing an array in numpy?
In [635]: x=np.arange(6,dtype='uint8')
In [636]: x
Out[636]: array([0, 1, 2, 3, 4, 5], dtype=uint8)
In [637]: x.strides
Out[637]: (1,)
In [638]: y=x.reshape(2,3)
In [639]: y
Out[639]:
array([[0, 1, 2],
[3, 4, 5]], dtype=uint8)
In [640]: y.strides
Out[640]: (3, 1)
In [641]: z = y.transpose()
In [642]: z
Out[642]:
array([[0, 3],
[1, 4],
[2, 5]], dtype=uint8)
In [643]: z.strides
Out[643]: (1, 3)
In [687]: for i in y:print(i)
[0 1 2]
[3 4 5]
In [688]: for i in z:print(i)
[0 3]
[1 4]
[2 5]
-----------------------
In [635]: x=np.arange(6,dtype='uint8')
In [636]: x
Out[636]: array([0, 1, 2, 3, 4, 5], dtype=uint8)
In [637]: x.strides
Out[637]: (1,)
In [638]: y=x.reshape(2,3)
In [639]: y
Out[639]:
array([[0, 1, 2],
[3, 4, 5]], dtype=uint8)
In [640]: y.strides
Out[640]: (3, 1)
In [641]: z = y.transpose()
In [642]: z
Out[642]:
array([[0, 3],
[1, 4],
[2, 5]], dtype=uint8)
In [643]: z.strides
Out[643]: (1, 3)
In [687]: for i in y:print(i)
[0 1 2]
[3 4 5]
In [688]: for i in z:print(i)
[0 3]
[1 4]
[2 5]
-----------------------
In [635]: x=np.arange(6,dtype='uint8')
In [636]: x
Out[636]: array([0, 1, 2, 3, 4, 5], dtype=uint8)
In [637]: x.strides
Out[637]: (1,)
In [638]: y=x.reshape(2,3)
In [639]: y
Out[639]:
array([[0, 1, 2],
[3, 4, 5]], dtype=uint8)
In [640]: y.strides
Out[640]: (3, 1)
In [641]: z = y.transpose()
In [642]: z
Out[642]:
array([[0, 3],
[1, 4],
[2, 5]], dtype=uint8)
In [643]: z.strides
Out[643]: (1, 3)
In [687]: for i in y:print(i)
[0 1 2]
[3 4 5]
In [688]: for i in z:print(i)
[0 3]
[1 4]
[2 5]
-----------------------
In [635]: x=np.arange(6,dtype='uint8')
In [636]: x
Out[636]: array([0, 1, 2, 3, 4, 5], dtype=uint8)
In [637]: x.strides
Out[637]: (1,)
In [638]: y=x.reshape(2,3)
In [639]: y
Out[639]:
array([[0, 1, 2],
[3, 4, 5]], dtype=uint8)
In [640]: y.strides
Out[640]: (3, 1)
In [641]: z = y.transpose()
In [642]: z
Out[642]:
array([[0, 3],
[1, 4],
[2, 5]], dtype=uint8)
In [643]: z.strides
Out[643]: (1, 3)
In [687]: for i in y:print(i)
[0 1 2]
[3 4 5]
In [688]: for i in z:print(i)
[0 3]
[1 4]
[2 5]
-----------------------
In [635]: x=np.arange(6,dtype='uint8')
In [636]: x
Out[636]: array([0, 1, 2, 3, 4, 5], dtype=uint8)
In [637]: x.strides
Out[637]: (1,)
In [638]: y=x.reshape(2,3)
In [639]: y
Out[639]:
array([[0, 1, 2],
[3, 4, 5]], dtype=uint8)
In [640]: y.strides
Out[640]: (3, 1)
In [641]: z = y.transpose()
In [642]: z
Out[642]:
array([[0, 3],
[1, 4],
[2, 5]], dtype=uint8)
In [643]: z.strides
Out[643]: (1, 3)
In [687]: for i in y:print(i)
[0 1 2]
[3 4 5]
In [688]: for i in z:print(i)
[0 3]
[1 4]
[2 5]
-----------------------
// NumPy arrays are not strongly typed internally: a view addresses the
// (contiguous) raw buffer of its base array through per-dimension byte
// strides, so traversal order is fully described by shape + strides.
char* data = view.rawData;
const size_t iStride = view.stride[0];
const size_t jStride = view.stride[1];
const size_t kStride = view.stride[2];
for(int i=0 ; i<view.shape[0] ; ++i) {
    for(int j=0 ; j<view.shape[1] ; ++j) {
        for(int k=0 ; k<view.shape[2] ; ++k) {
            // Byte offset of element (i, j, k) from the view strides
            const size_t offset = iStride * i + jStride * j + kStride * k;
            // BUG FIX: the original assigned the *pointer* to a Type
            // value; the pointed-to item must be dereferenced.
            Type item = *(Type*)(data + offset);
            // Do something with item here (eg. print it)
        }
    }
}
# Transposing only swaps the strides of a view; no data is moved.
arr = np.arange(16).reshape((2, 2, 4))
arr.strides # (32, 16, 4)
view = arr.transpose((1, 0, 2))
view.strides # (16, 32, 4) <-- note that 16 and 32 have been swapped
-----------------------
// NumPy arrays are not strongly typed internally: a view addresses the
// (contiguous) raw buffer of its base array through per-dimension byte
// strides, so traversal order is fully described by shape + strides.
char* data = view.rawData;
const size_t iStride = view.stride[0];
const size_t jStride = view.stride[1];
const size_t kStride = view.stride[2];
for(int i=0 ; i<view.shape[0] ; ++i) {
    for(int j=0 ; j<view.shape[1] ; ++j) {
        for(int k=0 ; k<view.shape[2] ; ++k) {
            // Byte offset of element (i, j, k) from the view strides
            const size_t offset = iStride * i + jStride * j + kStride * k;
            // BUG FIX: the original assigned the *pointer* to a Type
            // value; the pointed-to item must be dereferenced.
            Type item = *(Type*)(data + offset);
            // Do something with item here (eg. print it)
        }
    }
}
arr = np.arange(16).reshape((2, 2, 4))
arr.strides # (32, 16, 4)
view = arr.transpose((1, 0, 2))
view.strides # (16, 32, 4) <-- note that 16 and 32 have been swapped
How to fit the model for multi input problem in keras
# Two-input / two-output model: inputs and targets are passed as
# parallel lists (same structure repeated for validation_data).
model.fit([trainX1, trainX2], [trainY1, trainY2],validation_data=([valX1,valX2],[valY1,valY2]), epochs=5, batch_size=32, verbose=1)
How to check the input dimensions of a model in Flux.jl?
julia> size(x[:, :, :, 1])
(3, 3, 16)
julia> size(x[:, :, :, 1:1])
(3, 3, 16, 1)
julia> size(Flux.unsqueeze(x[:, :, :, 1], ndims(x)))
(3, 3, 16, 1)
julia> all(Flux.unsqueeze(x[:, :, :, 1], ndims(x)) .== x[:, :, :, 1])
true
-----------------------
julia> size(x[:, :, :, 1])
(3, 3, 16)
julia> size(x[:, :, :, 1:1])
(3, 3, 16, 1)
julia> size(Flux.unsqueeze(x[:, :, :, 1], ndims(x)))
(3, 3, 16, 1)
julia> all(Flux.unsqueeze(x[:, :, :, 1], ndims(x)) .== x[:, :, :, 1])
true
-----------------------
julia> size(x[:, :, :, 1])
(3, 3, 16)
julia> size(x[:, :, :, 1:1])
(3, 3, 16, 1)
julia> size(Flux.unsqueeze(x[:, :, :, 1], ndims(x)))
(3, 3, 16, 1)
julia> all(Flux.unsqueeze(x[:, :, :, 1], ndims(x)) .== x[:, :, :, 1])
true
apply pixel shader with WinRT's Windows.Graphics.Capture
m_d3dContext->CopyResource(m_SharedSurf, frameSurface.get());
//*********************************************************
//
// Copyright (c) Microsoft. All rights reserved.
// This code is licensed under the MIT License (MIT).
// THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
// THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
//*********************************************************
#include "pch.h"
#include "SimpleCapture.h"
#include <windows.h>
#include <d3d11.h>
#include <dxgi1_2.h>
#include <sal.h>
#include <new>
#include <warning.h>
#include <DirectXMath.h>
#include "PixelShader.h"
using namespace winrt;
using namespace Windows::Foundation;
using namespace Windows::System;
using namespace Windows::Graphics::Capture;
using namespace Windows::Graphics::DirectX;
using namespace Windows::Graphics::DirectX::Direct3D11;
using namespace Windows::Foundation::Numerics;
using namespace Windows::UI;
using namespace Windows::UI::Composition;
using namespace DirectX;
// A vertex with a clip-space position and a texture coordinate,
// matching the POSITION/TEXCOORD input layout built in InitShaders
// (TexCoord starts at byte offset 12 = sizeof(XMFLOAT3)).
typedef struct _VERTEX
{
    DirectX::XMFLOAT3 Pos;
    DirectX::XMFLOAT2 TexCoord;
} VERTEX;
//
// A vertex with a position and texture coordinate
//
// Capture `item` and render it through a pixel shader into a swapchain
// bound to `drawingHandle`. Sets up, in order: D3D device/context,
// DXGI factory + swapchain, shared texture, RTV, viewport, sampler,
// blend state, shaders, and finally the capture frame pool/session.
// Failures are only asserted on (original error-handling style kept).
SimpleCapture::SimpleCapture(
    IDirect3DDevice const& device,
    GraphicsCaptureItem const& item,
    HWND const& drawingHandle)
{
    m_item = item;
    m_device = device;
    m_WindowHandle = drawingHandle;
    HRESULT hr = S_OK;
    // Unwrap the WinRT device to the underlying D3D11 device/context.
    m_3dDevice = GetDXGIInterfaceFromObject<ID3D11Device>(m_device);
    m_3dDevice->GetImmediateContext(m_d3dContext.put());
    // Walk device -> adapter -> factory; the factory creates the swapchain.
    IDXGIDevice* DxgiDevice = nullptr;
    hr = m_3dDevice->QueryInterface(__uuidof(IDXGIDevice), reinterpret_cast<void**>(&DxgiDevice));
    if (FAILED(hr))
    {
        assert(false);
    }
    IDXGIAdapter* DxgiAdapter = nullptr;
    hr = DxgiDevice->GetParent(__uuidof(IDXGIAdapter), reinterpret_cast<void**>(&DxgiAdapter));
    DxgiDevice->Release();
    DxgiDevice = nullptr;
    if (FAILED(hr))
    {
        assert(false);
    }
    hr = DxgiAdapter->GetParent(__uuidof(IDXGIFactory2), reinterpret_cast<void**>(&m_Factory));
    DxgiAdapter->Release();
    DxgiAdapter = nullptr;
    if (FAILED(hr))
    {
        assert(false);
    }
    // Size the swapchain to the window's client area.
    RECT WindowRect;
    GetClientRect(m_WindowHandle, &WindowRect);
    UINT Width = WindowRect.right - WindowRect.left;
    UINT Height = WindowRect.bottom - WindowRect.top;
    // Create swapchain for window (BGRA8, double-buffered flip model).
    DXGI_SWAP_CHAIN_DESC1 SwapChainDesc;
    RtlZeroMemory(&SwapChainDesc, sizeof(SwapChainDesc));
    SwapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL;
    SwapChainDesc.BufferCount = 2;
    SwapChainDesc.Width = Width;
    SwapChainDesc.Height = Height;
    SwapChainDesc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
    SwapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
    SwapChainDesc.SampleDesc.Count = 1;
    SwapChainDesc.SampleDesc.Quality = 0;
    hr = m_Factory->CreateSwapChainForHwnd(m_3dDevice.get(), m_WindowHandle, &SwapChainDesc, nullptr, nullptr, &m_SwapChain);
    if (FAILED(hr))
    {
        assert(false);
    }
    // Disable the ALT-ENTER shortcut for entering full-screen mode
    hr = m_Factory->MakeWindowAssociation(m_WindowHandle, DXGI_MWA_NO_ALT_ENTER);
    if (FAILED(hr))
    {
        assert(false);
    }
    // Create the shared texture captured frames are copied into.
    hr = CreateSharedSurf();
    if (FAILED(hr))
    {
        assert(false);
    }
    // Make new render target view
    hr = MakeRTV();
    if (FAILED(hr))
    {
        assert(false);
    }
    // Set view port
    hr = SetViewPort(Width, Height);
    if (FAILED(hr))
    {
        assert(false);
    }
    // Linear-filtering, clamping sampler for the fullscreen pass.
    D3D11_SAMPLER_DESC SampDesc;
    RtlZeroMemory(&SampDesc, sizeof(SampDesc));
    SampDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR;
    SampDesc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
    SampDesc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
    SampDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
    SampDesc.ComparisonFunc = D3D11_COMPARISON_NEVER;
    SampDesc.MinLOD = 0;
    SampDesc.MaxLOD = D3D11_FLOAT32_MAX;
    hr = m_3dDevice->CreateSamplerState(&SampDesc, &m_SamplerLinear);
    if (FAILED(hr))
    {
        assert(false);
    }
    // Standard premultiplied src-alpha blend state.
    D3D11_BLEND_DESC BlendStateDesc;
    // BUG FIX: the descriptor was used with RenderTarget[1..7] left
    // uninitialized; zero the whole struct first like the other descs.
    RtlZeroMemory(&BlendStateDesc, sizeof(BlendStateDesc));
    BlendStateDesc.AlphaToCoverageEnable = FALSE;
    BlendStateDesc.IndependentBlendEnable = FALSE;
    BlendStateDesc.RenderTarget[0].BlendEnable = TRUE;
    BlendStateDesc.RenderTarget[0].SrcBlend = D3D11_BLEND_SRC_ALPHA;
    BlendStateDesc.RenderTarget[0].DestBlend = D3D11_BLEND_INV_SRC_ALPHA;
    BlendStateDesc.RenderTarget[0].BlendOp = D3D11_BLEND_OP_ADD;
    BlendStateDesc.RenderTarget[0].SrcBlendAlpha = D3D11_BLEND_ONE;
    BlendStateDesc.RenderTarget[0].DestBlendAlpha = D3D11_BLEND_ZERO;
    BlendStateDesc.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD;
    BlendStateDesc.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL;
    hr = m_3dDevice->CreateBlendState(&BlendStateDesc, &m_BlendState);
    if (FAILED(hr))
    {
        assert(false);
    }
    // Initialize shaders
    hr = InitShaders();
    if (FAILED(hr))
    {
        assert(false);
    }
    // END OF ADDED CHANGES
    auto size = m_item.Size();
    // Create framepool, define pixel format (DXGI_FORMAT_B8G8R8A8_UNORM), and frame size.
    m_framePool = Direct3D11CaptureFramePool::Create(
        m_device,
        DirectXPixelFormat::B8G8R8A8UIntNormalized,
        2,
        size);
    m_session = m_framePool.CreateCaptureSession(m_item);
    m_frameArrived = m_framePool.FrameArrived(auto_revoke, { this, &SimpleCapture::OnFrameArrived });
}
//
// Initialize shaders for drawing to screen
//
HRESULT SimpleCapture::InitShaders()
{
    HRESULT hr;
    // g_VS1 / g_main are precompiled shader byte-code arrays from
    // PixelShader.h (referenced, not defined, in this file).
    UINT Size = ARRAYSIZE(g_VS1);
    hr = m_3dDevice->CreateVertexShader(g_VS1, Size, nullptr, &m_VertexShader);
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }
    // Input layout must match the VERTEX struct: float3 position at
    // byte offset 0, float2 texcoord at byte offset 12.
    D3D11_INPUT_ELEMENT_DESC Layout[] =
    {
        {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
        {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}
    };
    UINT NumElements = ARRAYSIZE(Layout);
    // The layout is validated against the vertex-shader signature.
    hr = m_3dDevice->CreateInputLayout(Layout, NumElements, g_VS1, Size, &m_InputLayout);
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }
    m_d3dContext->IASetInputLayout(m_InputLayout);
    // Pixel shader (entry point "main" in the compiled blob).
    Size = ARRAYSIZE(g_main);
    hr = m_3dDevice->CreatePixelShader(g_main, Size, nullptr, &m_PixelShader);
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }
    return S_OK;
}
// Point the rasterizer at the full client area with a standard
// [0, 1] depth range.
HRESULT SimpleCapture::SetViewPort(UINT Width, UINT Height)
{
    D3D11_VIEWPORT viewport = {};
    viewport.TopLeftX = 0.0f;
    viewport.TopLeftY = 0.0f;
    viewport.Width = static_cast<FLOAT>(Width);
    viewport.Height = static_cast<FLOAT>(Height);
    viewport.MinDepth = 0.0f;
    viewport.MaxDepth = 1.0f;
    m_d3dContext->RSSetViewports(1, &viewport);
    return S_OK;
}
//
// Reset render target view
//
HRESULT SimpleCapture::MakeRTV()
{
    // Fetch the swapchain's current backbuffer.
    ID3D11Texture2D* BackBuffer = nullptr;
    HRESULT hr = m_SwapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), reinterpret_cast<void**>(&BackBuffer));
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }
    // Wrap it in a render target view (null desc = default view).
    hr = m_3dDevice->CreateRenderTargetView(BackBuffer, nullptr, &m_RTV);
    // The RTV holds its own reference, so the local one can be dropped.
    BackBuffer->Release();
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }
    // Bind as the output-merger target; no depth-stencil is used.
    m_d3dContext->OMSetRenderTargets(1, &m_RTV, nullptr);
    return S_OK;
}
//
// Recreate shared texture
//
// Create the intermediate texture captured frames are copied into.
// It is sized to the capture item (not the window) because
// OnFrameArrived uses CopyResource, which requires equal dimensions.
HRESULT SimpleCapture::CreateSharedSurf()
{
    auto size = m_item.Size();
    // (Fix: removed an unused RECT/GetClientRect pair — the window
    // rect was fetched but never read.)
    D3D11_TEXTURE2D_DESC DeskTexD;
    RtlZeroMemory(&DeskTexD, sizeof(D3D11_TEXTURE2D_DESC));
    DeskTexD.Width = static_cast<uint32_t>(size.Width);
    DeskTexD.Height = static_cast<uint32_t>(size.Height);
    DeskTexD.MipLevels = 1;
    DeskTexD.ArraySize = 1;
    // Must match the frame pool / swapchain pixel format (BGRA8).
    DeskTexD.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
    DeskTexD.SampleDesc.Count = 1;
    DeskTexD.Usage = D3D11_USAGE_DEFAULT;
    // Copy destination AND shader input for the fullscreen pass.
    DeskTexD.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE;
    DeskTexD.CPUAccessFlags = 0;
    DeskTexD.MiscFlags = 0;
    HRESULT hr = m_3dDevice->CreateTexture2D(&DeskTexD, nullptr, &m_SharedSurf);
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }
    return S_OK;
}
//
//
// Start sending capture frames
void SimpleCapture::StartCapture(HWND drawingHandle)
{
    // NOTE(review): drawingHandle is unused here — the window handle is
    // already stored by the constructor; confirm before removing it.
    CheckClosed();
    m_session.StartCapture();
}
// Wrap the swapchain in a composition surface so it can be attached
// to a Windows.UI.Composition visual tree.
ICompositionSurface SimpleCapture::CreateSurface(
    Compositor const& compositor)
{
    CheckClosed();
    return CreateCompositionSurfaceForSwapChain(compositor, m_SwapChain);
}
// Process captured frames
// Tear down capture resources. Idempotent: the atomic
// compare-exchange on m_closed ensures the body runs at most once.
void SimpleCapture::Close()
{
    auto expected = false;
    if (m_closed.compare_exchange_strong(expected, true))
    {
        // Revoke the callback first so no frame arrives mid-teardown.
        m_frameArrived.revoke();
        m_framePool.Close();
        m_session.Close();
        m_SwapChain = nullptr;
        m_framePool = nullptr;
        m_session = nullptr;
        m_item = nullptr;
    }
}
// Frame-pool callback: copy the captured frame into the shared
// texture, draw it through the pixel shader as a fullscreen quad,
// and present. (Fix: removed unused `newSize`/`frameContentSize`.)
void SimpleCapture::OnFrameArrived(
    Direct3D11CaptureFramePool const& sender,
    winrt::Windows::Foundation::IInspectable const&)
{
    auto frame = sender.TryGetNextFrame();
    // BUG FIX: TryGetNextFrame returns nullptr when no frame is
    // available; the original dereferenced it unconditionally.
    if (!frame)
    {
        return;
    }
    auto frameSurface = GetDXGIInterfaceFromObject<ID3D11Texture2D>(frame.Surface());
    // CopyResource requires m_SharedSurf to match the frame size
    // (guaranteed by CreateSharedSurf using m_item.Size()).
    m_d3dContext->CopyResource(m_SharedSurf, frameSurface.get());
    // Two triangles covering the whole target; V is flipped so the
    // image is not drawn upside down.
    VERTEX Vertices[NUMVERTICES] =
    {
        {XMFLOAT3(-1.0f, -1.0f, 0), XMFLOAT2(0.0f, 1.0f)},
        {XMFLOAT3(-1.0f, 1.0f, 0), XMFLOAT2(0.0f, 0.0f)},
        {XMFLOAT3(1.0f, -1.0f, 0), XMFLOAT2(1.0f, 1.0f)},
        {XMFLOAT3(1.0f, -1.0f, 0), XMFLOAT2(1.0f, 1.0f)},
        {XMFLOAT3(-1.0f, 1.0f, 0), XMFLOAT2(0.0f, 0.0f)},
        {XMFLOAT3(1.0f, 1.0f, 0), XMFLOAT2(1.0f, 0.0f)},
    };
    D3D11_TEXTURE2D_DESC FrameDesc;
    m_SharedSurf->GetDesc(&FrameDesc);
    // BUG FIX: the view descriptor was previously left uninitialized
    // apart from the fields set below; zero it first.
    D3D11_SHADER_RESOURCE_VIEW_DESC ShaderDesc;
    RtlZeroMemory(&ShaderDesc, sizeof(ShaderDesc));
    ShaderDesc.Format = FrameDesc.Format;
    ShaderDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
    ShaderDesc.Texture2D.MostDetailedMip = FrameDesc.MipLevels - 1;
    ShaderDesc.Texture2D.MipLevels = FrameDesc.MipLevels;
    // NOTE(review): the SRV and vertex buffer are recreated on every
    // frame; both could be created once and reused — confirm.
    ID3D11ShaderResourceView* ShaderResource = nullptr;
    HRESULT hr = m_3dDevice->CreateShaderResourceView(m_SharedSurf, &ShaderDesc, &ShaderResource);
    if (FAILED(hr))
    {
        // BUG FIX: the original continued and bound a null SRV.
        assert(false);
        return;
    }
    // Bind the fullscreen-pass pipeline state.
    UINT Stride = sizeof(VERTEX);
    UINT Offset = 0;
    FLOAT blendFactor[4] = { 0.f, 0.f, 0.f, 0.f };
    m_d3dContext->OMSetBlendState(nullptr, blendFactor, 0xffffffff);
    m_d3dContext->OMSetRenderTargets(1, &m_RTV, nullptr);
    m_d3dContext->VSSetShader(m_VertexShader, nullptr, 0);
    m_d3dContext->PSSetShader(m_PixelShader, nullptr, 0);
    m_d3dContext->PSSetShaderResources(0, 1, &ShaderResource);
    m_d3dContext->PSSetSamplers(0, 1, &m_SamplerLinear);
    m_d3dContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
    // Immutable-style vertex buffer for the six quad vertices.
    D3D11_BUFFER_DESC BufferDesc;
    RtlZeroMemory(&BufferDesc, sizeof(BufferDesc));
    BufferDesc.Usage = D3D11_USAGE_DEFAULT;
    BufferDesc.ByteWidth = sizeof(VERTEX) * NUMVERTICES;
    BufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    BufferDesc.CPUAccessFlags = 0;
    D3D11_SUBRESOURCE_DATA InitData;
    RtlZeroMemory(&InitData, sizeof(InitData));
    InitData.pSysMem = Vertices;
    ID3D11Buffer* VertexBuffer = nullptr;
    hr = m_3dDevice->CreateBuffer(&BufferDesc, &InitData, &VertexBuffer);
    if (FAILED(hr))
    {
        // BUG FIX: the original fell through and later called
        // VertexBuffer->Release() on a null pointer.
        assert(false);
        ShaderResource->Release();
        return;
    }
    m_d3dContext->IASetVertexBuffers(0, 1, &VertexBuffer, &Stride, &Offset);
    m_d3dContext->Draw(NUMVERTICES, 0);
    VertexBuffer->Release();
    VertexBuffer = nullptr;
    ShaderResource->Release();
    ShaderResource = nullptr;
    // Present with vsync (sync interval 1).
    DXGI_PRESENT_PARAMETERS presentParameters = { 0 };
    m_SwapChain->Present1(1, 0, &presentParameters);
}
-----------------------
m_d3dContext->CopyResource(m_SharedSurf, frameSurface.get());
//*********************************************************
//
// Copyright (c) Microsoft. All rights reserved.
// This code is licensed under the MIT License (MIT).
// THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
// THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
//*********************************************************
#include "pch.h"
#include "SimpleCapture.h"
#include <windows.h>
#include <d3d11.h>
#include <dxgi1_2.h>
#include <sal.h>
#include <new>
#include <warning.h>
#include <DirectXMath.h>
#include "PixelShader.h"
using namespace winrt;
using namespace Windows::Foundation;
using namespace Windows::System;
using namespace Windows::Graphics::Capture;
using namespace Windows::Graphics::DirectX;
using namespace Windows::Graphics::DirectX::Direct3D11;
using namespace Windows::Foundation::Numerics;
using namespace Windows::UI;
using namespace Windows::UI::Composition;
using namespace DirectX;
typedef struct _VERTEX
{
DirectX::XMFLOAT3 Pos;
DirectX::XMFLOAT2 TexCoord;
} VERTEX;
//
// A vertex with a position and texture coordinate
//
SimpleCapture::SimpleCapture(
IDirect3DDevice const& device,
GraphicsCaptureItem const& item,
HWND const& drawingHandle)
{
m_item = item;
m_device = device;
m_WindowHandle = drawingHandle;
HRESULT hr = S_OK;
m_3dDevice = GetDXGIInterfaceFromObject<ID3D11Device>(m_device);
m_3dDevice->GetImmediateContext(m_d3dContext.put());
// Get DXGI factory
IDXGIDevice* DxgiDevice = nullptr;
hr = m_3dDevice->QueryInterface(__uuidof(IDXGIDevice), reinterpret_cast<void**>(&DxgiDevice));
if (FAILED(hr))
{
assert(false);
}
IDXGIAdapter* DxgiAdapter = nullptr;
hr = DxgiDevice->GetParent(__uuidof(IDXGIAdapter), reinterpret_cast<void**>(&DxgiAdapter));
DxgiDevice->Release();
DxgiDevice = nullptr;
if (FAILED(hr))
{
assert(false);
}
hr = DxgiAdapter->GetParent(__uuidof(IDXGIFactory2), reinterpret_cast<void**>(&m_Factory));
DxgiAdapter->Release();
DxgiAdapter = nullptr;
if (FAILED(hr))
{
assert(false);
}
// Get window size
RECT WindowRect;
GetClientRect(m_WindowHandle, &WindowRect);
UINT Width = WindowRect.right - WindowRect.left;
UINT Height = WindowRect.bottom - WindowRect.top;
// Create swapchain for window
DXGI_SWAP_CHAIN_DESC1 SwapChainDesc;
RtlZeroMemory(&SwapChainDesc, sizeof(SwapChainDesc));
SwapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL;
SwapChainDesc.BufferCount = 2;
SwapChainDesc.Width = Width;
SwapChainDesc.Height = Height;
SwapChainDesc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
SwapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
SwapChainDesc.SampleDesc.Count = 1;
SwapChainDesc.SampleDesc.Quality = 0;
hr = m_Factory->CreateSwapChainForHwnd(m_3dDevice.get(), m_WindowHandle, &SwapChainDesc, nullptr, nullptr, &m_SwapChain);
if (FAILED(hr))
{
assert(false);
}
// Disable the ALT-ENTER shortcut for entering full-screen mode
hr = m_Factory->MakeWindowAssociation(m_WindowHandle, DXGI_MWA_NO_ALT_ENTER);
if (FAILED(hr))
{
assert(false);
}
// Create shared texture
hr = CreateSharedSurf();
if (FAILED(hr))
{
assert(false);
}
// Make new render target view
hr = MakeRTV();
if (FAILED(hr))
{
assert(false);
}
// Set view port
hr = SetViewPort(Width, Height);
if (FAILED(hr))
{
assert(false);
}
// Create the sample state
D3D11_SAMPLER_DESC SampDesc;
RtlZeroMemory(&SampDesc, sizeof(SampDesc));
SampDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR;
SampDesc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
SampDesc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
SampDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
SampDesc.ComparisonFunc = D3D11_COMPARISON_NEVER;
SampDesc.MinLOD = 0;
SampDesc.MaxLOD = D3D11_FLOAT32_MAX;
hr = m_3dDevice->CreateSamplerState(&SampDesc, &m_SamplerLinear);
if (FAILED(hr))
{
assert(false);
}
// Create the blend state
D3D11_BLEND_DESC BlendStateDesc;
BlendStateDesc.AlphaToCoverageEnable = FALSE;
BlendStateDesc.IndependentBlendEnable = FALSE;
BlendStateDesc.RenderTarget[0].BlendEnable = TRUE;
BlendStateDesc.RenderTarget[0].SrcBlend = D3D11_BLEND_SRC_ALPHA;
BlendStateDesc.RenderTarget[0].DestBlend = D3D11_BLEND_INV_SRC_ALPHA;
BlendStateDesc.RenderTarget[0].BlendOp = D3D11_BLEND_OP_ADD;
BlendStateDesc.RenderTarget[0].SrcBlendAlpha = D3D11_BLEND_ONE;
BlendStateDesc.RenderTarget[0].DestBlendAlpha = D3D11_BLEND_ZERO;
BlendStateDesc.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD;
BlendStateDesc.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL;
hr = m_3dDevice->CreateBlendState(&BlendStateDesc, &m_BlendState);
if (FAILED(hr))
{
assert(false);
}
// Initialize shaders
hr = InitShaders();
if (FAILED(hr))
{
assert(false);
}
// END OF ADDED CHANGES
auto size = m_item.Size();
// Create framepool, define pixel format (DXGI_FORMAT_B8G8R8A8_UNORM), and frame size.
m_framePool = Direct3D11CaptureFramePool::Create(
m_device,
DirectXPixelFormat::B8G8R8A8UIntNormalized,
2,
size);
m_session = m_framePool.CreateCaptureSession(m_item);
m_frameArrived = m_framePool.FrameArrived(auto_revoke, { this, &SimpleCapture::OnFrameArrived });
}
//
// Initialize shaders for drawing to screen
//
HRESULT SimpleCapture::InitShaders()
{
HRESULT hr;
UINT Size = ARRAYSIZE(g_VS1);
hr = m_3dDevice->CreateVertexShader(g_VS1, Size, nullptr, &m_VertexShader);
if (FAILED(hr))
{
assert(false);
return hr;
}
D3D11_INPUT_ELEMENT_DESC Layout[] =
{
{"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}
};
UINT NumElements = ARRAYSIZE(Layout);
hr = m_3dDevice->CreateInputLayout(Layout, NumElements, g_VS1, Size, &m_InputLayout);
if (FAILED(hr))
{
assert(false);
return hr;
}
m_d3dContext->IASetInputLayout(m_InputLayout);
Size = ARRAYSIZE(g_main);
hr = m_3dDevice->CreatePixelShader(g_main, Size, nullptr, &m_PixelShader);
if (FAILED(hr))
{
assert(false);
return hr;
}
return S_OK;
}
HRESULT SimpleCapture::SetViewPort(UINT Width, UINT Height)
{
D3D11_VIEWPORT VP;
VP.Width = static_cast<FLOAT>(Width);
VP.Height = static_cast<FLOAT>(Height);
VP.MinDepth = 0.0f;
VP.MaxDepth = 1.0f;
VP.TopLeftX = 0;
VP.TopLeftY = 0;
m_d3dContext->RSSetViewports(1, &VP);
return S_OK;
}
//
// (Re)create the render target view over the swap chain's back buffer
// and bind it as the current render target.
//
HRESULT SimpleCapture::MakeRTV()
{
    // BUG FIX: this is a "reset" function, but the previous RTV was never
    // released before being overwritten, leaking a COM reference on every
    // call after the first.
    // NOTE(review): assumes m_RTV is initialized to nullptr before the
    // first call — verify in the class definition.
    if (m_RTV)
    {
        m_RTV->Release();
        m_RTV = nullptr;
    }

    // Get backbuffer
    ID3D11Texture2D* BackBuffer = nullptr;
    HRESULT hr = m_SwapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), reinterpret_cast<void**>(&BackBuffer));
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }

    // Create a render target view. The view holds its own reference, so
    // the back buffer can be released immediately afterwards.
    hr = m_3dDevice->CreateRenderTargetView(BackBuffer, nullptr, &m_RTV);
    BackBuffer->Release();
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }

    // Set the new render target (no depth-stencil view).
    m_d3dContext->OMSetRenderTargets(1, &m_RTV, nullptr);
    return S_OK;
}
//
// (Re)create the shared texture that capture frames are copied into
// (see OnFrameArrived). Sized to the capture item.
//
HRESULT SimpleCapture::CreateSharedSurf()
{
    auto size = m_item.Size();

    // BUG FIX: removed the unused GetClientRect(m_WindowHandle, ...) call —
    // its result (magWindowRect) was never read.

    // Release a previously created surface so repeated calls do not leak.
    // NOTE(review): assumes m_SharedSurf starts out nullptr — verify in
    // the class definition.
    if (m_SharedSurf)
    {
        m_SharedSurf->Release();
        m_SharedSurf = nullptr;
    }

    // BGRA8 texture, single mip level, usable both as a render target and
    // as a shader resource (it is sampled when drawing each frame).
    D3D11_TEXTURE2D_DESC DeskTexD = {};
    DeskTexD.Width = static_cast<uint32_t>(size.Width);
    DeskTexD.Height = static_cast<uint32_t>(size.Height);
    DeskTexD.MipLevels = 1;
    DeskTexD.ArraySize = 1;
    DeskTexD.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
    DeskTexD.SampleDesc.Count = 1;
    DeskTexD.Usage = D3D11_USAGE_DEFAULT;
    DeskTexD.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE;
    DeskTexD.CPUAccessFlags = 0;
    DeskTexD.MiscFlags = 0;
    HRESULT hr = m_3dDevice->CreateTexture2D(&DeskTexD, nullptr, &m_SharedSurf);
    if (FAILED(hr))
    {
        assert(false);
        return hr;
    }
    return S_OK;
}
//
// Begin delivering capture frames: after this call the frame pool's
// FrameArrived handler (OnFrameArrived) fires for each new frame.
//
void SimpleCapture::StartCapture(HWND drawingHandle)
{
// Guard against use after Close() (CheckClosed presumably throws or
// asserts when the object is closed — confirm in its definition).
CheckClosed();
// NOTE(review): drawingHandle is unused here — presumably kept for
// signature compatibility with callers; confirm before removing.
m_session.StartCapture();
}
// Wrap the swap chain in an ICompositionSurface so the captured output
// can be hosted in a Windows.UI.Composition visual tree.
ICompositionSurface SimpleCapture::CreateSurface(
Compositor const& compositor)
{
// Guard against use after Close().
CheckClosed();
return CreateCompositionSurfaceForSwapChain(compositor, m_SwapChain);
}
// Tear down the capture: stop frame delivery and release the capture
// resources. Safe to call multiple times — the atomic compare-exchange
// ensures the body runs only once. (The previous comment here, "Process
// captured frames", described OnFrameArrived, not this function.)
void SimpleCapture::Close()
{
auto expected = false;
if (m_closed.compare_exchange_strong(expected, true))
{
// Unsubscribe first so no handler runs during the teardown below.
m_frameArrived.revoke();
m_framePool.Close();
m_session.Close();
m_SwapChain = nullptr;
m_framePool = nullptr;
m_session = nullptr;
m_item = nullptr;
}
}
//
// Per-frame handler: copy the new capture frame into the shared texture,
// draw it as a full-screen textured quad, and present the swap chain.
//
void SimpleCapture::OnFrameArrived(
    Direct3D11CaptureFramePool const& sender,
    winrt::Windows::Foundation::IInspectable const&)
{
    // Take the frame out of the pool; its surface is only valid while
    // `frame` is alive, so copy it into the shared texture right away.
    // (Removed the unused `newSize` and `frameContentSize` locals.)
    auto frame = sender.TryGetNextFrame();
    auto frameSurface = GetDXGIInterfaceFromObject<ID3D11Texture2D>(frame.Surface());
    m_d3dContext->CopyResource(m_SharedSurf, frameSurface.get());

    // Two triangles covering the whole render target; the UVs map the
    // full shared texture onto it.
    VERTEX Vertices[NUMVERTICES] =
    {
        {XMFLOAT3(-1.0f, -1.0f, 0), XMFLOAT2(0.0f, 1.0f)},
        {XMFLOAT3(-1.0f, 1.0f, 0), XMFLOAT2(0.0f, 0.0f)},
        {XMFLOAT3(1.0f, -1.0f, 0), XMFLOAT2(1.0f, 1.0f)},
        {XMFLOAT3(1.0f, -1.0f, 0), XMFLOAT2(1.0f, 1.0f)},
        {XMFLOAT3(-1.0f, 1.0f, 0), XMFLOAT2(0.0f, 0.0f)},
        {XMFLOAT3(1.0f, 1.0f, 0), XMFLOAT2(1.0f, 0.0f)},
    };

    // Build a shader resource view over the shared surface.
    // (ShaderDesc is now zero-initialized; it was previously left with
    // indeterminate padding/union contents.)
    D3D11_TEXTURE2D_DESC FrameDesc;
    m_SharedSurf->GetDesc(&FrameDesc);
    D3D11_SHADER_RESOURCE_VIEW_DESC ShaderDesc = {};
    ShaderDesc.Format = FrameDesc.Format;
    ShaderDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
    ShaderDesc.Texture2D.MostDetailedMip = FrameDesc.MipLevels - 1;
    ShaderDesc.Texture2D.MipLevels = FrameDesc.MipLevels;
    ID3D11ShaderResourceView* ShaderResource = nullptr;
    HRESULT hr = m_3dDevice->CreateShaderResourceView(m_SharedSurf, &ShaderDesc, &ShaderResource);
    if (FAILED(hr))
    {
        assert(false);
        // BUG FIX: execution previously fell through and dereferenced the
        // null ShaderResource when releasing it. Skip this frame instead.
        return;
    }

    // Bind the pipeline state for the full-screen draw.
    UINT Stride = sizeof(VERTEX);
    UINT Offset = 0;
    FLOAT blendFactor[4] = { 0.f, 0.f, 0.f, 0.f };
    m_d3dContext->OMSetBlendState(nullptr, blendFactor, 0xffffffff);
    m_d3dContext->OMSetRenderTargets(1, &m_RTV, nullptr);
    m_d3dContext->VSSetShader(m_VertexShader, nullptr, 0);
    m_d3dContext->PSSetShader(m_PixelShader, nullptr, 0);
    m_d3dContext->PSSetShaderResources(0, 1, &ShaderResource);
    m_d3dContext->PSSetSamplers(0, 1, &m_SamplerLinear);
    m_d3dContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST);

    // Vertex buffer holding the quad (recreated per frame, matching the
    // original behavior).
    D3D11_BUFFER_DESC BufferDesc = {};
    BufferDesc.Usage = D3D11_USAGE_DEFAULT;
    BufferDesc.ByteWidth = sizeof(VERTEX) * NUMVERTICES;
    BufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    BufferDesc.CPUAccessFlags = 0;
    D3D11_SUBRESOURCE_DATA InitData = {};
    InitData.pSysMem = Vertices;
    ID3D11Buffer* VertexBuffer = nullptr;
    hr = m_3dDevice->CreateBuffer(&BufferDesc, &InitData, &VertexBuffer);
    if (FAILED(hr))
    {
        assert(false);
        // BUG FIX: execution previously continued to
        // VertexBuffer->Release() on a null pointer. Clean up and bail.
        ShaderResource->Release();
        return;
    }

    m_d3dContext->IASetVertexBuffers(0, 1, &VertexBuffer, &Stride, &Offset);
    m_d3dContext->Draw(NUMVERTICES, 0);

    VertexBuffer->Release();
    ShaderResource->Release();

    // Present with a sync interval of 1 (wait for vblank).
    DXGI_PRESENT_PARAMETERS presentParameters = { 0 };
    m_SwapChain->Present1(1, 0, &presentParameters);
}
QUESTION
variadic template: SFINAE on last argument
Asked 2022-Mar-11 at 11:03 — I have an array (of any rank), and I would like to have an index operator that:
Allows for missing indices, such that the following are equivalent
a(1, 0, 0, 0);
a(1, my::missing);
This in itself is straightforward (see example implementation below): one just recursively adds arg * strides[dim]
until my::missing
is hit.
Allows automatic prepending of zeros, such that the following are equivalent
a(0, 0, 0, 1);
a(1);
Also this is not hard (see example implementation below): one recursively adds arg * strides[dim + offset]
.
What I cannot get my head around is: How to combine the two? The implementation of 2. makes me start off on the wrong foot for 1. (I'm limited to <= C++14)
// Sentinel value: marks the remaining (trailing) indices as omitted,
// i.e. implicitly zero.
enum class my { missing };

// Base case: no indices left to consume — contributes nothing.
template <size_t dim, class S>
inline size_t index_impl(const S&) noexcept
{
    return 0;
}

// Sentinel case: my::missing stops the expansion; the remaining
// dimensions are treated as zero.
template <size_t dim, class S, class... Rest>
inline size_t index_impl(const S&, enum my, Rest...) noexcept
{
    return 0;
}

// Recursive case: fold the current index with its stride, then recurse
// into the next dimension.
template <size_t dim, class S, class Head, class... Rest>
inline size_t index_impl(const S& strides, Head head, Rest... rest) noexcept
{
    return head * strides[dim] + index_impl<dim + 1>(strides, rest...);
}

// Public entry point: flatten an index tuple against `strides`.
template <class S, class Arg, class... Args>
inline size_t index(const S& strides, Arg arg, Args... args)
{
    return index_impl<0>(strides, arg, args...);
}
int main()
{
// Row-major strides; both calls flatten to the same offset
// (1*8 + 2*4 = 16).
std::vector<size_t> strides = {8, 4, 2 ,1};
std::cout << index(strides, 1, 2, 0, 0) << std::endl;
// my::missing replaces the explicit trailing zeros of the call above.
std::cout << index(strides, 1, 2, my::missing) << std::endl;
}
// Base case: all indices consumed.
template <size_t dim, class S>
inline size_t index_impl(const S&) noexcept
{
    return 0;
}

// Recursive case: combine the current index with its stride and move on
// to the next dimension. S may be a container or a random-access
// iterator — both support operator[].
template <size_t dim, class S, class Arg, class... Args>
inline size_t index_impl(const S& strides, Arg arg, Args... args) noexcept
{
    return arg * strides[dim] + index_impl<dim + 1>(strides, args...);
}

// Entry point: when fewer indices than dimensions are supplied, the
// missing leading indices are treated as zero by starting the expansion
// from an iterator positioned at the trailing strides.
template <class S, class Arg, class... Args>
inline size_t index(const S& strides, Arg arg, Args... args)
{
    constexpr size_t nargs = sizeof...(Args) + 1;
    if (nargs < strides.size())
    {
        // Index against the last `nargs` strides (leading zeros assumed).
        return index_impl<0>(strides.cend() - nargs, arg, args...);
    }
    // Exactly as many indices as dimensions (the nargs == size() and
    // fallback branches of the original were identical).
    return index_impl<0>(strides, arg, args...);
}
int main()
{
std::vector<size_t> strides = {8, 4, 2 ,1};
// Missing leading indices are implicitly zero: both calls flatten to
// 1*2 + 2*1 = 4.
std::cout << index(strides, 1, 2) << std::endl;
std::cout << index(strides, 0, 0, 1, 2) << std::endl;
}
ANSWER
Answered 2022-Mar-11 at 11:03 — In an earlier version of this answer I didn't provide a full implementation since something was not adding up for me.
If this index
should calculate the index into a flattened multidimensional array, then your example implementation is invalid. The problem is hidden because you are comparing two results for index
with all indexes provided against a shortened version where zero padding is assumed.
Sadly I followed this pattern in the first versions of the test in Catch2.
Here is proper test for index of flattened multidimensional array, where last index matches flattened index:
// Catch2 test pinning the intended semantics of index() for a flattened
// multidimensional array with extents {4, 6, 3, 5}.
TEST_CASE("index")
{
std::vector<size_t> strides = { 4, 6, 3, 5 };
SECTION("Padding with leading zeros")
{
constexpr auto i0 = 4;
constexpr auto i1 = 2;
// With leading zeros only the last two dimensions contribute.
constexpr size_t expected = i0 + i1 * 5;
CHECK(index(strides, 0, 0, i1, i0) == expected);
CHECK(index(strides, 0, 0, i1, i0 - 1) == expected - 1); // decrementing the last index shifts the flattened index by one
CHECK(index(strides, i1, i0) == expected);
CHECK(index(strides, i1, i0 - 1) == expected - 1);
}
SECTION("Use my::missing to use padding with tailing zeros")
{
constexpr auto i2 = 4;
constexpr auto i3 = 2;
// Trailing omitted dimensions scale the result by the product of
// their extents (5 * 3).
constexpr size_t expected = (i3 * 6 + i2) * 5 * 3;
CHECK(index(strides, i3, i2, 0, 0) == expected);
CHECK(index(strides, i3, i2, my::missing) == expected);
}
}
Now, starting from your code and passing those tests, I've got this implementation:
// Metafunction: the last type of a non-empty parameter pack.
template <typename T, typename... Ts>
struct last_type_helper {
    using type = typename last_type_helper<Ts...>::type;
};
template <typename T>
struct last_type_helper<T> {
    using type = T;
};
template <typename... Ts>
using last_type = typename last_type_helper<Ts...>::type;

// Sentinel marking trailing indices as intentionally omitted.
enum class my { missing };

// True when the final argument type of the pack is the `my` sentinel.
template <typename... Ts>
constexpr bool LastTypeIsMy = std::is_same<my, last_type<Ts...>>::value;

// Base case: nothing left to fold.
template <class StrideIter>
size_t index_impl(size_t base, StrideIter)
{
    return base;
}

// Sentinel case: my::missing ends the fold early.
template <class StrideIter>
size_t index_impl(size_t base, StrideIter, my)
{
    return base;
}

// Horner-style fold: scale the accumulator by the current extent and
// add the next index.
template <class StrideIter, typename Tn, typename... Ts>
size_t index_impl(size_t base, StrideIter it, Tn xn, Ts... x)
{
    return index_impl(base * *it + xn, it + 1, x...);
}

// Flatten a (possibly partial) index tuple against `strides`.
// Missing leading indices are zero-padded; a trailing my::missing
// zero-pads the remaining trailing dimensions instead.
template <class S, class... Args>
size_t index(const S& strides, Args... args)
{
    const size_t missingCount = strides.size() - sizeof...(Args);
    const bool endsWithSentinel = LastTypeIsMy<Args...>;
    // Leading-zero padding: skip the leading extents entirely.
    const size_t advanceBy = endsWithSentinel ? 0 : missingCount;
    // Trailing-zero padding: the product of the omitted trailing extents
    // scales the folded result.
    const size_t lastStrides = endsWithSentinel ? missingCount + 1 : 0;
    const size_t tailFactor = std::accumulate(std::end(strides) - lastStrides, std::end(strides),
        size_t { 1 }, std::multiplies<> {});
    return index_impl(0, std::begin(strides) + advanceBy, args...) * tailFactor;
}
Here is a live demo (passing the tests).
Community Discussions, Code Snippets contain sources that include Stack Exchange Network
Save this library and start creating your kit
Explore Related Topics
Save this library and start creating your kit