Quantcast
Channel: Intel® C++-Compiler
Viewing all articles
Browse latest Browse all 1665

ICC 15.0 generates slower AVX code than SSSE3 code

$
0
0
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Rotate a row-major byte matrix by 90 degrees.
 *
 * src is read as h rows of w bytes; des is written as w rows of h bytes.
 * Element (row y, column x) of src is stored at (row w-1-x, column y) of
 * des, i.e. a counter-clockwise quarter turn when row 0 is drawn on top.
 *
 * Both buffers must hold at least w*h bytes and must not overlap.
 * Non-positive w or h makes the call a no-op. Always returns 0.
 */
int matrix_rot(uint8_t* src, uint8_t* des, int w, int h) {
	for (int y = 0; y < h; ++y) {
		/*
		 * Index of the LAST element of source row y. Using (w - 1) here
		 * keeps row_end - x inside row y for every x in [0, w); basing it
		 * on h instead would index outside the row whenever h != w - 1.
		 */
		const int row_end = y * w + (w - 1);
		for (int x = 0; x < w; ++x) {
			des[x*h+y] = src[row_end-x];
		}
	}
	return 0;
}

/*
 * Print a row-major byte matrix to stdout: h lines, each holding w decimal
 * values, every value followed by a single space. Always returns 0.
 */
int matrix_dump(uint8_t* buf, int w, int h) {
	int offset = 0;	/* linear index of the next element to print */
	for (int row = 0; row < h; ++row) {
		for (int col = 0; col < w; ++col, ++offset) {
			printf("%d ", buf[offset]);
		}
		putchar('\n');
	}
	return 0;
}

/*
 * Benchmark driver. Usage: rot <width> <height>
 *
 * Allocates two zero-filled width*height byte buffers, calls matrix_rot
 * 2 * rand() times (seeded with 12 so the count is repeatable), then prints
 * the first 12 bytes of each buffer as a 4x3 and a 3x4 grid.
 *
 * Returns 0 on success and -1 on bad arguments or allocation failure.
 */
int main(int argc, char** argv) {
	enum { DUMP_W = 4, DUMP_H = 3 };

	if (argc != 3) return -1;

	char *end_w = NULL;
	char *end_h = NULL;
	const long lw = strtol(argv[1], &end_w, 10);
	const long lh = strtol(argv[2], &end_h, 10);

	/* Reject empty or partially numeric arguments. */
	if (end_w == argv[1] || *end_w != '\0') return -1;
	if (end_h == argv[2] || *end_h != '\0') return -1;
	/*
	 * Reject non-positive sizes and any pair whose product does not fit in
	 * an int, because matrix_rot indexes the buffers with int arithmetic.
	 * This also covers the LONG_MAX/LONG_MIN values strtol returns on
	 * out-of-range input.
	 */
	if (lw <= 0 || lh <= 0 || lw > INT_MAX / lh) return -1;

	const int w = (int)lw;
	const int h = (int)lh;
	const size_t count = (size_t)w * (size_t)h;

	/* calloc: the source buffer is read without ever being filled in. */
	uint8_t *bmp = calloc(count, sizeof *bmp);
	uint8_t *dst = calloc(count, sizeof *dst);
	if (bmp == NULL || dst == NULL) {
		free(bmp);
		free(dst);
		return -1;
	}

	srand(12);
	int c = rand();
	printf("%d\n", c);
	for (int i = 0; i < c; i++) {
		matrix_rot(bmp, dst, w, h);
		matrix_rot(bmp, dst, w, h);
	}

	/* The fixed-size sample dump needs at least DUMP_W * DUMP_H bytes. */
	if (count >= (size_t)DUMP_W * DUMP_H) {
		matrix_dump(bmp, DUMP_W, DUMP_H);
		matrix_dump(dst, DUMP_H, DUMP_W);
	}
	free(bmp);
	free(dst);

	return 0;
}

ICC 15.0 generates slower AVX/SSE4.2 code than SSSE3 code.

The SSSE3 compilation command line and result:

$ icc -std=c99 -xSSSE3 -O3 -o rot rot.c
$ time ./rot 320 240
201684
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
./rot 320 240  10.06s user 0.01s system 99% cpu 10.077 total

The AVX/SSE4.2 compilation command line and result (the run shown here uses -xSSE4.2):

$ icc -std=c99 -xSSE4.2 -O3 -o rot rot.c
$ time ./rot 320 240
201684
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
./rot 320 240  16.64s user 0.00s system 99% cpu 16.650 total

As you can see, the AVX/SSE4.2 version takes about 65% longer than the SSSE3 version (16.64 s vs. 10.06 s of user time).


Viewing all articles
Browse latest Browse all 1665

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>