[FFmpeg-devel] [PATCH] ppc: replace vec_ld(0), vec_ld(1) by VEC_LD2() which has fewer loads
Michael Niedermayer
michaelni at gmx.at
Fri Nov 7 15:12:19 CET 2014
This needs to be benchmarked; I do not have PPC hardware.
On big endian, this is closer to how the code was before commit 79e0255956bc8fcdb143f39b2e45db77144ac017.
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
libavcodec/ppc/hpeldsp_altivec.c | 30 ++++++++++--------------------
libavutil/ppc/util_altivec.h | 16 ++++++++++++++++
2 files changed, 26 insertions(+), 20 deletions(-)
diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 87a1f05..05d8b81 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -123,8 +123,7 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
- pixelsv1 = VEC_LD(0, pixels);
- pixelsv2 = VEC_LD(1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
@@ -136,8 +135,7 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
int rightside = ((unsigned long)block & 0x0000000F);
blockv = vec_ld(0, block);
- pixelsv1 = unaligned_load(line_size, pixels);
- pixelsv2 = unaligned_load(line_size+1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -171,8 +169,7 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
- pixelsv1 = VEC_LD(0, pixels);
- pixelsv2 = VEC_LD(1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum1 = vec_add((vector unsigned short)pixelsv1,
@@ -183,8 +180,7 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
int rightside = ((unsigned long)block & 0x0000000F);
blockv = vec_ld(0, block);
- pixelsv1 = unaligned_load(line_size, pixels);
- pixelsv2 = unaligned_load(line_size+1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -218,8 +214,7 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
- pixelsv1 = VEC_LD(0, pixels);
- pixelsv2 = VEC_LD(1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
@@ -234,8 +229,7 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
for (i = 0; i < h ; i++) {
blockv = vec_ld(0, block);
- pixelsv1 = unaligned_load(line_size, pixels);
- pixelsv2 = unaligned_load(line_size+1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
@@ -274,8 +268,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
- pixelsv1 = VEC_LD(0, pixels);
- pixelsv2 = VEC_LD(1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
@@ -288,8 +281,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
pixelssum1 = vec_add(pixelssum1, vcone);
for (i = 0; i < h ; i++) {
- pixelsv1 = unaligned_load(line_size, pixels);
- pixelsv2 = unaligned_load(line_size+1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
@@ -329,8 +321,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
register const vector unsigned short vctwo = (const vector unsigned short)
vec_splat_u16(2);
- pixelsv1 = VEC_LD(0, pixels);
- pixelsv2 = VEC_LD(1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum1 = vec_add((vector unsigned short)pixelsv1,
@@ -341,8 +332,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
int rightside = ((unsigned long)block & 0x0000000F);
blockv = vec_ld(0, block);
- pixelsv1 = unaligned_load(line_size, pixels);
- pixelsv2 = unaligned_load(line_size+1, pixels);
+ VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
diff --git a/libavutil/ppc/util_altivec.h b/libavutil/ppc/util_altivec.h
index fd3bfd3..9fda566 100644
--- a/libavutil/ppc/util_altivec.h
+++ b/libavutil/ppc/util_altivec.h
@@ -88,9 +88,25 @@ do { \
#if HAVE_BIGENDIAN
#define VEC_LD(offset,b) \
vec_perm(vec_ld(offset, b), vec_ld(offset+15, b), vec_lvsl(offset, b))
+
+#define VEC_LD2(dst1, dst2, offset, b) do { /* dst1 = 16 bytes at (b)+(offset), dst2 = 16 bytes at (b)+(offset)+1; offset is in bytes */ \
+ register vector unsigned char temp1 = vec_ld((offset), (b)); /* aligned 16-byte block containing the start address */ \
+ register vector unsigned char temp2 = vec_ld((offset) + 16, (b)); /* the following aligned 16-byte block */ \
+ (dst1) = vec_perm(temp1, temp2, vec_lvsl((offset), (b))); \
+ if ((((unsigned long)(b) + (offset)) & 0x0000000F) == 0x0000000F) { /* byte address math, matching vec_ld's byte offset */ \
+ (dst2) = temp2; /* (offset)+1 is 16-byte aligned here, so the data is exactly temp2; a vec_lvsl shift of 0 would pick temp1 */ \
+ } else { \
+ (dst2) = vec_perm(temp1, temp2, vec_lvsl((offset) + 1, (b))); \
+ } \
+ } while(0)
#else
#define VEC_LD(offset,b) \
vec_vsx_ld(offset, b)
+
+#define VEC_LD2(dst1, dst2, offset, b) do { /* !HAVE_BIGENDIAN variant: two independent VEC_LD loads, at offset and at offset+1 */ \
+ (dst1) = VEC_LD(offset ,b); \
+ (dst2) = VEC_LD((offset)+1,b); \
+ } while(0)
#endif
/** @brief loads unaligned vector @a *src with offset @a offset
--
1.7.9.5
More information about the ffmpeg-devel
mailing list