#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size of the small preview printed after the benchmark loop. */
enum { PREVIEW_W = 4, PREVIEW_H = 3 };

/**
 * Rotate a row-major byte matrix by 90 degrees counter-clockwise.
 *
 * The source is read as h rows of w bytes; the destination is written as
 * w rows of h bytes. Destination row x receives source column (w - 1 - x),
 * read from top to bottom.
 *
 * @param src  source buffer holding at least w * h bytes
 * @param des  destination buffer holding at least w * h bytes; it is
 *             written while src is still being read, so it should not
 *             overlap src
 * @param w    number of columns in the source
 * @param h    number of rows in the source
 * @return     always 0
 */
int matrix_rot(uint8_t* src, uint8_t* des, int w, int h)
{
    for (int y = 0; y < h; ++y) {
        /*
         * Index of the LAST element of source row y. Using "+ h" here is
         * only right when h == w - 1; for any other shape it reads the
         * wrong bytes and can index below the start of src.
         */
        const int row_end = y * w + (w - 1);
        for (int x = 0; x < w; ++x) {
            des[x * h + y] = src[row_end - x];
        }
    }
    return 0;
}

/**
 * Print a row-major byte matrix to stdout, one row per line, each value
 * in decimal followed by a single space.
 *
 * @param buf  buffer holding at least w * h bytes
 * @param w    number of columns
 * @param h    number of rows
 * @return     always 0
 */
int matrix_dump(uint8_t* buf, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            printf("%d ", buf[y * w + x]);
        }
        printf("\n");
    }
    return 0;
}

/*
 * Parse a strictly positive decimal dimension that fits in an int.
 * Returns 0 on success and stores the value in *out; returns -1 when the
 * text is empty, has trailing characters, or is out of range.
 */
static int parse_dim(const char *text, int *out)
{
    char *end = NULL;
    const long value = strtol(text, &end, 10);

    /* Out-of-range input makes strtol return LONG_MIN/LONG_MAX, which the
     * range test below rejects as well. */
    if (end == text || *end != '\0' || value <= 0 || value > INT_MAX) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/**
 * Benchmark driver: "rot WIDTH HEIGHT".
 *
 * Allocates two zero-filled WIDTH x HEIGHT byte buffers, runs matrix_rot
 * on them a pseudo-random (fixed seed) number of times, then prints a
 * small preview of both buffers.
 *
 * @return 0 on success, -1 on bad arguments or allocation failure
 */
int main(int argc, char** argv)
{
    if (argc != 3) {
        return -1;
    }

    int w = 0;
    int h = 0;
    if (parse_dim(argv[1], &w) != 0 || parse_dim(argv[2], &h) != 0) {
        return -1;
    }
    /* matrix_rot indexes with int, so w * h itself must fit in an int. */
    if (h > INT_MAX / w) {
        return -1;
    }

    const size_t count = (size_t)w * (size_t)h;

    /* calloc so that the buffers hold defined (zero) contents before they
     * are read by matrix_rot and matrix_dump. */
    uint8_t *bmp = calloc(count, sizeof *bmp);
    uint8_t *dst = calloc(count, sizeof *dst);
    if (bmp == NULL || dst == NULL) {
        free(bmp);
        free(dst);
        return -1;
    }

    srand(12);
    int c = rand();
    printf("%d\n", c);

    /* The workload deliberately performs the same rotation twice per
     * iteration; this is the timed part of the benchmark. */
    for (int i = 0; i < c; i++) {
        matrix_rot(bmp, dst, w, h);
        matrix_rot(bmp, dst, w, h);
    }

    /* Only the first PREVIEW_W * PREVIEW_H bytes are shown; skip the
     * preview when the buffers are too small to hold that many. */
    if (count >= (size_t)(PREVIEW_W * PREVIEW_H)) {
        matrix_dump(bmp, PREVIEW_W, PREVIEW_H);
        matrix_dump(dst, PREVIEW_H, PREVIEW_W);
    }

    free(bmp);
    free(dst);
    return 0;
}
ICC 15.0 generates slower AVX/SSE4.2 code than SSSE3 code.
The SSSE3 compilation command line and result:
$ icc -std=c99 -xSSSE3 -O3 -o rot rot.c
$ time ./rot 320 240
201684
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
./rot 320 240 10.06s user 0.01s system 99% cpu 10.077 total
The SSE4.2 compilation command line and result:
$ icc -std=c99 -xSSE4.2 -O3 -o rot rot.c
$ time ./rot 320 240
201684
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
./rot 320 240 16.64s user 0.00s system 99% cpu 16.650 total
As you can see, the SSE4.2 version runs roughly 65% slower than the SSSE3 version (16.64 s vs. 10.06 s of user time).