/home/fresvfqn/.cagefs/tmp/phptrIRNy
ELF>�@@
GCC: (GNU) 8.5.0 20210514 (Red Hat 8.5.0-28)GNU�GA$3a1�	crtstuff.c__FRAME_END____TMC_END__.symtab.strtab.shstrtab.text.data.bss.eh_frame.tm_clone_table.comment.note.GNU-stack.note.gnu.property.rela.gnu.build.attributes@!@'@,@6HF0H.Ov_x w�$r@ 0	�8	�&P�# This spec file is read by gcc when linking.  It is used to specify the
# standard libraries we need in order to link with various sanitizer libs.

*link_libasan: -lrt -ldl -lrt -lpthread -lm

*link_libtsan: -lrt -ldl -lrt -lpthread -lm

*link_libubsan: -ldl -lrt -lpthread -lm

*link_liblsan: -ldl -lrt -lpthread -lm

/* Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 10.0.  */

#ifndef _SMMINTRIN_H_INCLUDED
#define _SMMINTRIN_H_INCLUDED

/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
   files.  */
#include <tmmintrin.h>

#ifndef __SSE4_1__
#pragma GCC push_options
#pragma GCC target("sse4.1")
#define __DISABLE_SSE4_1__
#endif /* __SSE4_1__ */

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT	0x00
#define _MM_FROUND_TO_NEG_INF		0x01
#define _MM_FROUND_TO_POS_INF		0x02
#define _MM_FROUND_TO_ZERO		0x03
#define _MM_FROUND_CUR_DIRECTION	0x04

#define _MM_FROUND_RAISE_EXC		0x00
#define _MM_FROUND_NO_EXC		0x08

#define _MM_FROUND_NINT		\
  (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR	\
  (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL		\
  (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC	\
  (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT		\
  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT	\
  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

/* Test Instruction */
/* Packed integer 128-bit bitwise comparison. Return 1 if
   (__V & __M) == 0.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_si128 (__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
}

/* Packed integer 128-bit bitwise comparison. Return 1 if
   (__V & ~__M) == 0.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_si128 (__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
}

/* Packed integer 128-bit bitwise comparison. Return 1 if
   (__V & __M) != 0 && (__V & ~__M) != 0.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_si128 (__m128i __M, __m128i __V)
{
  return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
}

/* Macros for packed integer 128-bit comparison intrinsics.  */
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

#define _mm_test_all_ones(V) \
  _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))

/* Packed/scalar double precision floating point rounding.  */

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd (__m128d __V, const int __M)
{
  return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd(__m128d __D, __m128d __V, const int __M)
{
  return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
					   (__v2df)__V,
					   __M);
}
#else
#define _mm_round_pd(V, M) \
  ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))

#define _mm_round_sd(D, V, M)						\
  ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D),		\
				     (__v2df)(__m128d)(V), (int)(M)))
#endif

/* Packed/scalar single precision floating point rounding.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps (__m128 __V, const int __M)
{
  return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss (__m128 __D, __m128 __V, const int __M)
{
  return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
					  (__v4sf)__V,
					  __M);
}
#else
#define _mm_round_ps(V, M) \
  ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))

#define _mm_round_ss(D, V, M)						\
  ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D),		\
				    (__v4sf)(__m128)(V), (int)(M)))
#endif

/* Macros for ceil/floor intrinsics.  */
#define _mm_ceil_pd(V)	   _mm_round_pd ((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V)  _mm_round_sd ((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V)	   _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V)	   _mm_round_ps ((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V)  _mm_round_ss ((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V)	   _mm_round_ps ((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)

/* SSE4.1 */

/* Integer blend instructions - select data from 2 sources using
   constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
					      (__v8hi)__Y,
					      __M);
}
#else
#define _mm_blend_epi16(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X),		\
					(__v8hi)(__m128i)(Y), (int)(M)))
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
{
  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
					       (__v16qi)__Y,
					       (__v16qi)__M);
}

/* Single precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
{
  return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
					  (__v4sf)__Y,
					  __M);
}
#else
#define _mm_blend_ps(X, Y, M)						\
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X),		\
				    (__v4sf)(__m128)(Y), (int)(M)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
{
  return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
					   (__v4sf)__Y,
					   (__v4sf)__M);
}

/* Double precision floating point blend instructions - select data
   from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
{
  return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
					   (__v2df)__Y,
					   __M);
}
#else
#define _mm_blend_pd(X, Y, M)						\
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X),		\
				     (__v2df)(__m128d)(Y), (int)(M)))
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
{
  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
					    (__v2df)__Y,
					    (__v2df)__M);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
{
  return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
				       (__v4sf)__Y,
				       __M);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
{
  return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
					(__v2df)__Y,
					__M);
}
#else
#define _mm_dp_ps(X, Y, M)						\
  ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X),			\
				 (__v4sf)(__m128)(Y), (int)(M)))

#define _mm_dp_pd(X, Y, M)						\
  ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X),			\
				  (__v2df)(__m128d)(Y), (int)(M)))
#endif

/* Packed integer 64-bit comparison, zeroing or filling with ones
   corresponding parts of result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) ((__v2di)__X == (__v2di)__Y);
}

/*  Min/max packed integer instructions.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
}

/* Packed integer 32-bit multiplication with truncation of upper
   halves of results.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) ((__v4su)__X * (__v4su)__Y);
}

/* Packed integer 32-bit multiplication of 2 pairs of operands
   with two 64-bit results.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
}

/* Insert single precision float into packed single precision array
   element selected by index N.  The bits [7-6] of N define S
   index, the bits [5-4] define D index, and bits [3-0] define
   zeroing mask for D.  */

#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
{
  return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
					      (__v4sf)__S,
					      __N);
}
#else
#define _mm_insert_ps(D, S, N)						\
  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D),		\
					(__v4sf)(__m128)(S), (int)(N)))
#endif

/* Helper macro to create the N value for _mm_insert_ps.  */
#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))

/* Extract binary representation of single precision float from packed
   single precision array element of X selected by index N.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
  union { int i; float f; } __tmp;
  __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
  return __tmp.i;
}
#else
#define _mm_extract_ps(X, N)						\
  (__extension__							\
   ({									\
     union { int i; float f; } __tmp;					\
     __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); \
     __tmp.i;								\
   }))
#endif

/* Extract binary representation of single precision float into
   D from packed single precision array element of S selected
   by index N.  */
#define _MM_EXTRACT_FLOAT(D, S, N) \
  { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
  
/* Extract specified single precision float element into the lower
   part of __m128.  */
#define _MM_PICK_OUT_PS(X, N)				\
  _mm_insert_ps (_mm_setzero_ps (), (X), 		\
		 _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))

/* Insert integer, S, into packed integer array element of D
   selected by index N.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i __D, int __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
						 __S, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i __D, int __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
						 __S, __N);
}

#ifdef __x86_64__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
{
  return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
						 __S, __N);
}
#endif
#else
#define _mm_insert_epi8(D, S, N)					\
  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D),	\
					   (int)(S), (int)(N)))

#define _mm_insert_epi32(D, S, N)				\
  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D),	\
					  (int)(S), (int)(N)))

#ifdef __x86_64__
#define _mm_insert_epi64(D, S, N)					\
  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D),		\
					  (long long)(S), (int)(N)))
#endif
#endif

/* Extract integer from packed integer array element of X selected by
   index N.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
   return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
   return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
}

#ifdef __x86_64__
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
}
#endif
#else
#define _mm_extract_epi8(X, N) \
  ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
#define _mm_extract_epi32(X, N) \
  ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))

#ifdef __x86_64__
#define _mm_extract_epi64(X, N) \
  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
#endif
#endif

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
}

/* Packed integer sign-extension.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
}

/* Packed integer zero-extension. */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
}

/* Pack 8 double words from 2 operands into 8 words of result with
   unsigned saturation. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
}

/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
					      (__v16qi)__Y, __M);
}
#else
#define _mm_mpsadbw_epu8(X, Y, M)					\
  ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X),		\
					(__v16qi)(__m128i)(Y), (int)(M)))
#endif

/* Load double quadword using non-temporal aligned hint.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_load_si128 (__m128i *__X)
{
  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
}

#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */

/* These macros specify the source data format.  */
#define _SIDD_UBYTE_OPS			0x00
#define _SIDD_UWORD_OPS			0x01
#define _SIDD_SBYTE_OPS			0x02
#define _SIDD_SWORD_OPS			0x03

/* These macros specify the comparison operation.  */
#define _SIDD_CMP_EQUAL_ANY		0x00
#define _SIDD_CMP_RANGES		0x04
#define _SIDD_CMP_EQUAL_EACH		0x08
#define _SIDD_CMP_EQUAL_ORDERED		0x0c

/* These macros specify the polarity.  */
#define _SIDD_POSITIVE_POLARITY		0x00
#define _SIDD_NEGATIVE_POLARITY		0x10
#define _SIDD_MASKED_POSITIVE_POLARITY	0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY	0x30

/* These macros specify the output selection in _mm_cmpXstri ().  */
#define _SIDD_LEAST_SIGNIFICANT		0x00
#define _SIDD_MOST_SIGNIFICANT		0x40

/* These macros specify the output selection in _mm_cmpXstrm ().  */
#define _SIDD_BIT_MASK			0x00
#define _SIDD_UNIT_MASK			0x40

/* Intrinsics for text/string processing.  */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
						(__v16qi)__Y,
						__M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
				      (__v16qi)__Y,
				      __M);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
						(__v16qi)__Y, __LY,
						__M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
				      (__v16qi)__Y, __LY,
				      __M);
}
#else
#define _mm_cmpistrm(X, Y, M)						\
  ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X),	\
					  (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistri(X, Y, M)						\
  ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X),		\
				      (__v16qi)(__m128i)(Y), (int)(M)))

#define _mm_cmpestrm(X, LX, Y, LY, M)					\
  ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X),	\
					  (int)(LX), (__v16qi)(__m128i)(Y), \
					  (int)(LY), (int)(M)))
#define _mm_cmpestri(X, LX, Y, LY, M)					\
  ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX),	\
				      (__v16qi)(__m128i)(Y), (int)(LY),	\
				      (int)(M)))
#endif

/* Intrinsics for text/string processing and reading values of
   EFlags.  */

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
{
  return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
				       (__v16qi)__Y,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
{
  return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
				       (__v16qi)__Y, __LY,
				       __M);
}
#else
#define _mm_cmpistra(X, Y, M)						\
  ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X),		\
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrc(X, Y, M)						\
  ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X),		\
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistro(X, Y, M)						\
  ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X),		\
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrs(X, Y, M)						\
  ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X),		\
				       (__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrz(X, Y, M)						\
  ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X),		\
				       (__v16qi)(__m128i)(Y), (int)(M)))

#define _mm_cmpestra(X, LX, Y, LY, M)					\
  ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestrc(X, LX, Y, LY, M)					\
  ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestro(X, LX, Y, LY, M)					\
  ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestrs(X, LX, Y, LY, M)					\
  ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#define _mm_cmpestrz(X, LX, Y, LY, M)					\
  ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
				       (__v16qi)(__m128i)(Y), (int)(LY), \
				       (int)(M)))
#endif

/* Packed integer 64-bit comparison, zeroing or filling with ones
   corresponding parts of result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) ((__v2di)__X > (__v2di)__Y);
}

#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */

#ifdef __DISABLE_SSE4_1__
#undef __DISABLE_SSE4_1__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_1__ */

#include <popcntintrin.h>

#ifndef __SSE4_1__
#pragma GCC push_options
#pragma GCC target("sse4.1")
#define __DISABLE_SSE4_1__
#endif /* __SSE4_1__ */

#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_1__ */

/* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u8 (unsigned int __C, unsigned char __V)
{
  return __builtin_ia32_crc32qi (__C, __V);
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u16 (unsigned int __C, unsigned short __V)
{
  return __builtin_ia32_crc32hi (__C, __V);
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u32 (unsigned int __C, unsigned int __V)
{
  return __builtin_ia32_crc32si (__C, __V);
}

#ifdef __x86_64__
extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
{
  return __builtin_ia32_crc32di (__C, __V);
}
#endif

#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */

#ifdef __DISABLE_SSE4_1__
#undef __DISABLE_SSE4_1__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_1__ */

#endif /* _SMMINTRIN_H_INCLUDED */
/* Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _GFNIINTRIN_H_INCLUDED
#define _GFNIINTRIN_H_INCLUDED

#if !defined(__GFNI__) || !defined(__SSE2__)
#pragma GCC push_options
#pragma GCC target("gfni,sse2")
#define __DISABLE_GFNI__
#endif /* __GFNI__ */

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_gf2p8mul_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
						   (__v16qi) __B);
}

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_gf2p8affineinv_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
{
  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi ((__v16qi) __A,
							   (__v16qi) __B,
							    __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_gf2p8affine_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
{
  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi) __A,
							(__v16qi) __B, __C);
}
#else
#define _mm_gf2p8affineinv_epi64_epi8(A, B, C)				   \
  ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
					   (__v16qi)(__m128i)(B), (int)(C)))
#define _mm_gf2p8affine_epi64_epi8(A, B, C)				   \
  ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi)(__m128i)(A),   \
					   (__v16qi)(__m128i)(B), (int)(C)))
#endif

#ifdef __DISABLE_GFNI__
#undef __DISABLE_GFNI__
#pragma GCC pop_options
#endif /* __DISABLE_GFNI__ */

#if !defined(__GFNI__) || !defined(__AVX__)
#pragma GCC push_options
#pragma GCC target("gfni,avx")
#define __DISABLE_GFNIAVX__
#endif /* __GFNIAVX__ */

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_gf2p8mul_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi ((__v32qi) __A,
						    (__v32qi) __B);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_gf2p8affineinv_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
{
  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi ((__v32qi) __A,
							   (__v32qi) __B,
							    __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_gf2p8affine_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
{
  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi) __A,
							(__v32qi) __B, __C);
}
#else
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C)			   \
  ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
						    (__v32qi)(__m256i)(B), \
						    (int)(C)))
#define _mm256_gf2p8affine_epi64_epi8(A, B, C)				   \
  ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi)(__m256i)(A),   \
					(   __v32qi)(__m256i)(B), (int)(C)))
#endif

#ifdef __DISABLE_GFNIAVX__
#undef __DISABLE_GFNIAVX__
#pragma GCC pop_options
#endif /* __GFNIAVX__ */

#if !defined(__GFNI__) || !defined(__AVX512VL__)
#pragma GCC push_options
#pragma GCC target("gfni,avx512vl")
#define __DISABLE_GFNIAVX512VL__
#endif /* __GFNIAVX512VL__ */

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_gf2p8mul_epi8 (__m128i __A, __mmask16 __B, __m128i __C, __m128i __D)
{
  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __C,
							 (__v16qi) __D,
							 (__v16qi)__A, __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_gf2p8mul_epi8 (__mmask16 __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __B,
			(__v16qi) __C, (__v16qi) _mm_setzero_si128 (), __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_gf2p8affineinv_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
				    __m128i __D, const int __E)
{
  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __C,
								(__v16qi) __D,
								 __E,
								(__v16qi)__A,
								 __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_gf2p8affineinv_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
				     const int __D)
{
  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __B,
						(__v16qi) __C, __D,
						(__v16qi) _mm_setzero_si128 (),
						 __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_gf2p8affine_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
				 __m128i __D, const int __E)
{
  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __C,
					(__v16qi) __D, __E, (__v16qi)__A, __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_gf2p8affine_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
				  const int __D)
{
  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __B,
		     (__v16qi) __C, __D, (__v16qi) _mm_setzero_si128 (), __A);
}
#else
#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) 		   \
  ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask(		   \
			(__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D),      \
			(int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
  ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask(		   \
			(__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C),	   \
			(int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), \
			(__mmask16)(A)))
#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(C),\
      (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D)			    \
  ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B),\
		(__v16qi)(__m128i)(C), (int)(D),			    \
		(__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A)))
#endif

#ifdef __DISABLE_GFNIAVX512VL__
#undef __DISABLE_GFNIAVX512VL__
#pragma GCC pop_options
#endif /* __GFNIAVX512VL__ */

#if !defined(__GFNI__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("gfni,avx512vl,avx512bw")
#define __DISABLE_GFNIAVX512VLBW__
#endif /* __GFNIAVX512VLBW__ */

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_gf2p8mul_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
			   __m256i __D)
{
  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __C,
							 (__v32qi) __D,
							 (__v32qi)__A, __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_gf2p8mul_epi8 (__mmask32 __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __B,
			(__v32qi) __C, (__v32qi) _mm256_setzero_si256 (), __A);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_gf2p8affineinv_epi64_epi8 (__m256i __A, __mmask32 __B,
				       __m256i __C, __m256i __D, const int __E)
{
  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __C,
								(__v32qi) __D,
							 	 __E,
								(__v32qi)__A,
								 __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_gf2p8affineinv_epi64_epi8 (__mmask32 __A, __m256i __B,
					__m256i __C, const int __D)
{
  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __B,
				      (__v32qi) __C, __D,
				      (__v32qi) _mm256_setzero_si256 (), __A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_gf2p8affine_epi64_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
				    __m256i __D, const int __E)
{
  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __C,
							     (__v32qi) __D,
							      __E,
							     (__v32qi)__A,
							      __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_gf2p8affine_epi64_epi8 (__mmask32 __A, __m256i __B,
				     __m256i __C, const int __D)
{
  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __B,
		(__v32qi) __C, __D, (__v32qi)_mm256_setzero_si256 (), __A);
}
#else
#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E)		\
  ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask(		\
	(__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E),		\
	(__v32qi)(__m256i)(A), (__mmask32)(B)))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D)		\
  ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask(		\
	(__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D),		\
	(__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) 		    \
  ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(C),\
	(__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B)))
#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D)			    \
  ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B),\
	 (__v32qi)(__m256i)(C), (int)(D),				    \
	 (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
#endif

#ifdef __DISABLE_GFNIAVX512VLBW__
#undef __DISABLE_GFNIAVX512VLBW__
#pragma GCC pop_options
#endif /* __GFNIAVX512VLBW__ */

#if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("gfni,avx512f,avx512bw")
#define __DISABLE_GFNIAVX512FBW__
#endif /* __GFNIAVX512FBW__ */

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_gf2p8mul_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
			   __m512i __D)
{
  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __C,
					(__v64qi) __D, (__v64qi)__A, __B);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_gf2p8mul_epi8 (__mmask64 __A, __m512i __B, __m512i __C)
{
  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __B,
			(__v64qi) __C, (__v64qi) _mm512_setzero_si512 (), __A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A,
						    (__v64qi) __B);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_gf2p8affineinv_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
				       __m512i __D, const int __E)
{
  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __C,
								(__v64qi) __D,
								 __E,
								(__v64qi)__A,
								 __B);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_gf2p8affineinv_epi64_epi8 (__mmask64 __A, __m512i __B,
					__m512i __C, const int __D)
{
  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __B,
				(__v64qi) __C, __D,
				(__v64qi) _mm512_setzero_si512 (), __A);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
{
  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A,
							   (__v64qi) __B, __C);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_gf2p8affine_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
				    __m512i __D, const int __E)
{
  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __C,
					(__v64qi) __D, __E, (__v64qi)__A, __B);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_gf2p8affine_epi64_epi8 (__mmask64 __A, __m512i __B, __m512i __C,
				     const int __D)
{
  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __B,
		  (__v64qi) __C, __D, (__v64qi) _mm512_setzero_si512 (), __A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
{
  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A,
							(__v64qi) __B, __C);
}
#else
#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) 		\
  ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask(		\
	(__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E),		\
	(__v64qi)(__m512i)(A), (__mmask64)(B)))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D)		\
  ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask(		\
	(__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D),		\
	(__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C)			\
  ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi (			\
	(__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E)		    \
  ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(C),\
     (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B)))
#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D)			    \
  ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(B),\
	 (__v64qi)(__m512i)(C), (int)(D),				    \
	 (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
#define _mm512_gf2p8affine_epi64_epi8(A, B, C)				    \
  ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A),    \
	 (__v64qi)(__m512i)(B), (int)(C)))
#endif

#ifdef __DISABLE_GFNIAVX512FBW__
#undef __DISABLE_GFNIAVX512FBW__
#pragma GCC pop_options
#endif /* __GFNIAVX512FBW__ */

#endif /* _GFNIINTRIN_H_INCLUDED */
/* Copyright (C) 1989-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*
 * ISO C Standard:  7.15  Variable arguments  <stdarg.h>
 */

#ifndef _STDARG_H
#ifndef _ANSI_STDARG_H_
#ifndef __need___va_list
#define _STDARG_H
#define _ANSI_STDARG_H_
#endif /* not __need___va_list */
#undef __need___va_list

/* Define __gnuc_va_list.  */

#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST
typedef __builtin_va_list __gnuc_va_list;
#endif

/* Define the standard macros for the user,
   if this invocation was from the user program.  */
#ifdef _STDARG_H

#define va_start(v,l)	__builtin_va_start(v,l)
#define va_end(v)	__builtin_va_end(v)
#define va_arg(v,l)	__builtin_va_arg(v,l)
#if !defined(__STRICT_ANSI__) || __STDC_VERSION__ + 0 >= 199900L \
    || __cplusplus + 0 >= 201103L
#define va_copy(d,s)	__builtin_va_copy(d,s)
#endif
#define __va_copy(d,s)	__builtin_va_copy(d,s)

/* Define va_list, if desired, from __gnuc_va_list. */
/* We deliberately do not define va_list when called from
   stdio.h, because ANSI C says that stdio.h is not supposed to define
   va_list.  stdio.h needs to have access to that data type, 
   but must not use that name.  It should use the name __gnuc_va_list,
   which is safe because it is reserved for the implementation.  */

#ifdef _BSD_VA_LIST
#undef _BSD_VA_LIST
#endif

#if defined(__svr4__) || (defined(_SCO_DS) && !defined(__VA_LIST))
/* SVR4.2 uses _VA_LIST for an internal alias for va_list,
   so we must avoid testing it and setting it here.
   SVR4 uses _VA_LIST as a flag in stdarg.h, but we should
   have no conflict with that.  */
#ifndef _VA_LIST_
#define _VA_LIST_
#ifdef __i860__
#ifndef _VA_LIST
#define _VA_LIST va_list
#endif
#endif /* __i860__ */
typedef __gnuc_va_list va_list;
#ifdef _SCO_DS
#define __VA_LIST
#endif
#endif /* _VA_LIST_ */
#else /* not __svr4__ || _SCO_DS */

/* The macro _VA_LIST_ is the same thing used by this file in Ultrix.
   But on BSD NET2 we must not test or define or undef it.
   (Note that the comments in NET 2's ansi.h
   are incorrect for _VA_LIST_--see stdio.h!)  */
#if !defined (_VA_LIST_) || defined (__BSD_NET2__) || defined (____386BSD____) || defined (__bsdi__) || defined (__sequent__) || defined (__FreeBSD__) || defined(WINNT)
/* The macro _VA_LIST_DEFINED is used in Windows NT 3.5  */
#ifndef _VA_LIST_DEFINED
/* The macro _VA_LIST is used in SCO Unix 3.2.  */
#ifndef _VA_LIST
/* The macro _VA_LIST_T_H is used in the Bull dpx2  */
#ifndef _VA_LIST_T_H
/* The macro __va_list__ is used by BeOS.  */
#ifndef __va_list__
typedef __gnuc_va_list va_list;
#endif /* not __va_list__ */
#endif /* not _VA_LIST_T_H */
#endif /* not _VA_LIST */
#endif /* not _VA_LIST_DEFINED */
#if !(defined (__BSD_NET2__) || defined (____386BSD____) || defined (__bsdi__) || defined (__sequent__) || defined (__FreeBSD__))
#define _VA_LIST_
#endif
#ifndef _VA_LIST
#define _VA_LIST
#endif
#ifndef _VA_LIST_DEFINED
#define _VA_LIST_DEFINED
#endif
#ifndef _VA_LIST_T_H
#define _VA_LIST_T_H
#endif
#ifndef __va_list__
#define __va_list__
#endif

#endif /* not _VA_LIST_, except on certain systems */

#endif /* not __svr4__ */

#endif /* _STDARG_H */

#endif /* not _ANSI_STDARG_H_ */
#endif /* not _STDARG_H */
/* Copyright (C) 2015-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _IMMINTRIN_H_INCLUDED
# error "Never use <cetintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _CETINTRIN_H_INCLUDED
#define _CETINTRIN_H_INCLUDED

#ifndef __SHSTK__
#pragma GCC push_options
#pragma GCC target ("shstk")
#define __DISABLE_SHSTK__
#endif /* __SHSTK__ */

#ifdef __x86_64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_get_ssp (void)
{
  return __builtin_ia32_rdsspq ();
}
#else
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_get_ssp (void)
{
  return __builtin_ia32_rdsspd ();
}
#endif

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_inc_ssp (unsigned int __B)
{
#ifdef __x86_64__
  __builtin_ia32_incsspq ((unsigned long long) __B);
#else
  __builtin_ia32_incsspd (__B);
#endif
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_saveprevssp (void)
{
  __builtin_ia32_saveprevssp ();
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rstorssp (void *__B)
{
  __builtin_ia32_rstorssp (__B);
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrssd (unsigned int __B, void *__C)
{
  __builtin_ia32_wrssd (__B, __C);
}

#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrssq (unsigned long long __B, void *__C)
{
  __builtin_ia32_wrssq (__B, __C);
}
#endif

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrussd (unsigned int __B, void *__C)
{
  __builtin_ia32_wrussd (__B, __C);
}

#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrussq (unsigned long long __B, void *__C)
{
  __builtin_ia32_wrussq (__B, __C);
}
#endif

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_setssbsy (void)
{
  __builtin_ia32_setssbsy ();
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_clrssbsy (void *__B)
{
  __builtin_ia32_clrssbsy (__B);
}

#ifdef __DISABLE_SHSTK__
#undef __DISABLE_SHSTK__
#pragma GCC pop_options
#endif /* __DISABLE_SHSTK__ */

#endif /* _CETINTRIN_H_INCLUDED.  */
/* Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _X86INTRIN_H_INCLUDED
# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _FMA4INTRIN_H_INCLUDED
#define _FMA4INTRIN_H_INCLUDED

/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files.  */
#include <ammintrin.h>

#ifndef __FMA4__
#pragma GCC push_options
#pragma GCC target("fma4")
#define __DISABLE_FMA4__
#endif /* __FMA4__ */

/* 128b Floating point multiply/add type instructions.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)

{
  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

/* 256b Floating point multiply/add type instructions.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)

{
  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

#ifdef __DISABLE_FMA4__
#undef __DISABLE_FMA4__
#pragma GCC pop_options
#endif /* __DISABLE_FMA4__ */

#endif
/* Copyright (C) 2003-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _EMMINTRIN_H_INCLUDED
#define _EMMINTRIN_H_INCLUDED

/* We need definitions from the SSE header files*/
#include <xmmintrin.h>

#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */

/* SSE2 */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
typedef signed char __v16qs __attribute__ ((__vector_size__ (16)));
typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Create a selector for use with the SHUFPD instruction.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return __extension__ (__m128d){ 0.0, 0.0 };
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  return *(__m128d *)__P;
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return *(__m128d_u *)__P;
}

/* Create a vector with all two elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return _mm_set1_pd (*__P);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __m128d __tmp = _mm_load_pd (__P);
  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  *(__m128d *)__P = __A;
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpltsd ((__v2df) __B,
								 (__v2df)
								 __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmplesd ((__v2df) __B,
								 (__v2df)
								 __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnltsd ((__v2df) __B,
								  (__v2df)
								  __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnlesd ((__v2df) __B,
								  (__v2df)
								  __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}

/* Create a vector of Qi, where i is the element number.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Create a vector with element 0 as *P and the rest zero.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64_u *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(__m64_u *)__P = (__m64) ((__v2di)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}
#endif

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask);
}
#else
#define _mm_shuffle_pd(A, B, N)						\
  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A),		\
				   (__v2df)(__m128d)(B), (int)(N)))
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  return __builtin_ia32_movmskpd ((__v2df)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A * (__v8hu)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}
#else
#define _mm_bsrli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_bslli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_srli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_slli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A & (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A | (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A ^ (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A == (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A == (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A == (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qs)__A < (__v16qs)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A < (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A < (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qs)__A > (__v16qs)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A > (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A > (__v4si)__B);
}

#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_insert_epi16(A, D, N)				\
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A),	\
					  (int)(D), (int)(N)))
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
}
#else
#define _mm_shufflehi_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shufflelo_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shuffle_epi32(A, N) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N)))
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

#ifdef __x86_64__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  __builtin_ia32_movnti64 (__A, __B);
}
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}

#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */

#endif /* _EMMINTRIN_H_INCLUDED */
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _RTMINTRIN_H_INCLUDED
#define _RTMINTRIN_H_INCLUDED

#ifndef __RTM__
#pragma GCC push_options
#pragma GCC target("rtm")
#define __DISABLE_RTM__
#endif /* __RTM__ */

#define _XBEGIN_STARTED		(~0u)
#define _XABORT_EXPLICIT	(1 << 0)
#define _XABORT_RETRY		(1 << 1)
#define _XABORT_CONFLICT	(1 << 2)
#define _XABORT_CAPACITY	(1 << 3)
#define _XABORT_DEBUG		(1 << 4)
#define _XABORT_NESTED		(1 << 5)
#define _XABORT_CODE(x)		(((x) >> 24) & 0xFF)

/* Start an RTM code region.  Return _XBEGIN_STARTED on success and the
   abort condition otherwise.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xbegin (void)
{
  return __builtin_ia32_xbegin ();
}

/* Specify the end of an RTM code region.  If it corresponds to the
   outermost transaction, then attempts the transaction commit.  If the
   commit fails, then control is transferred to the outermost transaction
   fallback handler.  */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xend (void)
{
  __builtin_ia32_xend ();
}

/* Force an RTM abort condition. The control is transferred to the
   outermost transaction fallback handler with the abort condition IMM.  */
#ifdef __OPTIMIZE__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xabort (const unsigned int __imm)
{
  __builtin_ia32_xabort (__imm);
}
#else
#define _xabort(N)  __builtin_ia32_xabort (N)
#endif /* __OPTIMIZE__ */

#ifdef __DISABLE_RTM__
#undef __DISABLE_RTM__
#pragma GCC pop_options
#endif /* __DISABLE_RTM__ */

#endif /* _RTMINTRIN_H_INCLUDED */
/* Copyright (C) 2014-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <xsavesintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _XSAVESINTRIN_H_INCLUDED
#define _XSAVESINTRIN_H_INCLUDED

#ifndef __XSAVES__
#pragma GCC push_options
#pragma GCC target("xsaves")
#define __DISABLE_XSAVES__
#endif /* __XSAVES__ */

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xsaves (void *__P, long long __M)
{
  __builtin_ia32_xsaves (__P, __M);
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xrstors (void *__P, long long __M)
{
  __builtin_ia32_xrstors (__P, __M);
}

#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xrstors64 (void *__P, long long __M)
{
  __builtin_ia32_xrstors64 (__P, __M);
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xsaves64 (void *__P, long long __M)
{
  __builtin_ia32_xsaves64 (__P, __M);
}
#endif

#ifdef __DISABLE_XSAVES__
#undef __DISABLE_XSAVES__
#pragma GCC pop_options
#endif /* __DISABLE_XSAVES__ */

#endif /* _XSAVESINTRIN_H_INCLUDED */
/* Copyright (C) 2015-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _IMMINTRIN_H_INCLUDED
# error "Never use <avx5124vnniwintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _AVX5124VNNIWINTRIN_H_INCLUDED
#define _AVX5124VNNIWINTRIN_H_INCLUDED

#ifndef __AVX5124VNNIW__
#pragma GCC push_options
#pragma GCC target("avx5124vnniw")
#define __DISABLE_AVX5124VNNIW__
#endif /* __AVX5124VNNIW__ */

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C,
		      __m512i __D, __m512i __E, __m128i *__F)
{
  return (__m512i) __builtin_ia32_vp4dpwssd ((__v16si) __B,
					     (__v16si) __C,
					     (__v16si) __D,
					     (__v16si) __E,
					     (__v16si) __A,
					     (const __v4si *) __F);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4dpwssd_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
			   __m512i __C, __m512i __D, __m512i __E,
			   __m128i *__F)
{
  return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
						  (__v16si) __C,
						  (__v16si) __D,
						  (__v16si) __E,
						  (__v16si) __A,
						  (const __v4si *) __F,
						  (__v16si) __A,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4dpwssd_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
			    __m512i __C, __m512i __D, __m512i __E,
			    __m128i *__F)
{
  return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
						  (__v16si) __C,
						  (__v16si) __D,
						  (__v16si) __E,
						  (__v16si) __A,
						  (const __v4si *) __F,
						  (__v16si) _mm512_setzero_ps (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C,
		       __m512i __D, __m512i __E, __m128i *__F)
{
  return (__m512i) __builtin_ia32_vp4dpwssds ((__v16si) __B,
					      (__v16si) __C,
					      (__v16si) __D,
					      (__v16si) __E,
					      (__v16si) __A,
					      (const __v4si *) __F);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4dpwssds_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
			    __m512i __C, __m512i __D, __m512i __E,
			    __m128i *__F)
{
  return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
						   (__v16si) __C,
						   (__v16si) __D,
						   (__v16si) __E,
						   (__v16si) __A,
						   (const __v4si *) __F,
						   (__v16si) __A,
						   (__mmask16) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4dpwssds_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
			     __m512i __C, __m512i __D, __m512i __E,
			     __m128i *__F)
{
  return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
						   (__v16si) __C,
						   (__v16si) __D,
						   (__v16si) __E,
						   (__v16si) __A,
						   (const __v4si *) __F,
						   (__v16si) _mm512_setzero_ps (),
						   (__mmask16) __U);
}

#ifdef __DISABLE_AVX5124VNNIW__
#undef __DISABLE_AVX5124VNNIW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX5124VNNIW__ */

#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vbmi2vlintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED
#define _AVX512VBMI2VLINTRIN_H_INCLUDED

#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__)
#pragma GCC push_options
#pragma GCC target("avx512vbmi2,avx512vl")
#define __DISABLE_AVX512VBMI2VL__
#endif /* __AVX512VBMIVL__ */

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compress_epi8 (__m128i __A, __mmask16 __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi)__C,
						(__v16qi)__A, (__mmask16)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_compress_epi8 (__mmask16 __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __B,
			(__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
}


extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compressstoreu_epi16 (void * __A, __mmask16 __B, __m256i __C)
{
  __builtin_ia32_compressstoreuhi256_mask ((__v16hi *) __A, (__v16hi) __C,
							(__mmask16) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compress_epi16 (__m128i __A, __mmask8 __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi)__C, (__v8hi)__A,
								(__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_compress_epi16 (__mmask8 __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __B,
				(__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compress_epi16 (__m256i __A, __mmask16 __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi)__C,
						(__v16hi)__A, (__mmask16)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_compress_epi16 (__mmask16 __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __B,
			(__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compressstoreu_epi8 (void * __A, __mmask16 __B, __m128i __C)
{
  __builtin_ia32_compressstoreuqi128_mask ((__v16qi *) __A, (__v16qi) __C,
							(__mmask16) __B);
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compressstoreu_epi16 (void * __A, __mmask8 __B, __m128i __C)
{
  __builtin_ia32_compressstoreuhi128_mask ((__v8hi *) __A, (__v8hi) __C,
							(__mmask8) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expand_epi8 (__m128i __A, __mmask16 __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __C,
						    (__v16qi) __A,
						    (__mmask16) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expand_epi8 (__mmask16 __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_expandqi128_maskz ((__v16qi) __B,
			(__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expandloadu_epi8 (__m128i __A, __mmask16 __B, const void * __C)
{
  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *) __C,
					(__v16qi) __A, (__mmask16) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expandloadu_epi8 (__mmask16 __A, const void * __B)
{
  return (__m128i) __builtin_ia32_expandloadqi128_maskz ((const __v16qi *) __B,
			(__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expand_epi16 (__m128i __A, __mmask8 __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __C,
						    (__v8hi) __A,
						    (__mmask8) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expand_epi16 (__mmask8 __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_expandhi128_maskz ((__v8hi) __B,
				(__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expandloadu_epi16 (__m128i __A, __mmask8 __B, const void * __C)
{
  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *) __C,
						(__v8hi) __A, (__mmask8) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expandloadu_epi16 (__mmask8 __A, const void * __B)
{
  return (__m128i) __builtin_ia32_expandloadhi128_maskz ((const __v8hi *) __B,
				(__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expand_epi16 (__m256i __A, __mmask16 __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __C,
						    (__v16hi) __A,
						    (__mmask16) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expand_epi16 (__mmask16 __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_expandhi256_maskz ((__v16hi) __B,
			(__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expandloadu_epi16 (__m256i __A, __mmask16 __B, const void * __C)
{
  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *) __C,
					(__v16hi) __A, (__mmask16) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expandloadu_epi16 (__mmask16 __A, const void * __B)
{
  return (__m256i) __builtin_ia32_expandloadhi256_maskz ((const __v16hi *) __B,
			(__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdi_epi16 (__m256i __A, __m256i __B, int __C)
{
  return (__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)__A, (__v16hi) __B,
									__C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D,
								int __E)
{
  return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__C,
			(__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
{
  return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__B,
	(__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
								int __E)
{
  return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__C, (__v8si) __D,
					__E, (__v8si) __A, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
{
  return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__B, (__v8si) __C,
			__D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdi_epi32 (__m256i __A, __m256i __B, int __C)
{
  return (__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)__A, (__v8si) __B, __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
								int __E)
{
  return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__C, (__v4di) __D,
					__E, (__v4di) __A, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
{
  return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__B, (__v4di) __C,
			__D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdi_epi64 (__m256i __A, __m256i __B, int __C)
{
  return (__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)__A, (__v4di) __B, __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
								int __E)
{
  return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__C, (__v8hi) __D,
					__E, (__v8hi) __A, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
{
  return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
			__D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdi_epi16 (__m128i __A, __m128i __B, int __C)
{
  return (__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)__A, (__v8hi) __B, __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
								int __E)
{
  return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__C, (__v4si) __D,
					__E, (__v4si) __A, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
{
  return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__B, (__v4si) __C,
			__D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdi_epi32 (__m128i __A, __m128i __B, int __C)
{
  return (__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)__A, (__v4si) __B, __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
								int __E)
{
  return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__C, (__v2di) __D,
					__E, (__v2di) __A, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
{
  return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__B, (__v2di) __C,
			__D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdi_epi64 (__m128i __A, __m128i __B, int __C)
{
  return (__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)__A, (__v2di) __B, __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldi_epi16 (__m256i __A, __m256i __B, int __C)
{
  return (__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)__A, (__v16hi) __B,
									__C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D,
								int __E)
{
  return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__C,
			(__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
{
  return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__B,
	(__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
								int __E)
{
  return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__C, (__v8si) __D,
					__E, (__v8si) __A, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
{
  return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__B, (__v8si) __C,
			__D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldi_epi32 (__m256i __A, __m256i __B, int __C)
{
  return (__m256i) __builtin_ia32_vpshld_v8si ((__v8si)__A, (__v8si) __B, __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
								int __E)
{
  return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__C, (__v4di) __D,
					__E, (__v4di) __A, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
{
  return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__B, (__v4di) __C,
			__D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldi_epi64 (__m256i __A, __m256i __B, int __C)
{
  return (__m256i) __builtin_ia32_vpshld_v4di ((__v4di)__A, (__v4di) __B, __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
								int __E)
{
  return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__C, (__v8hi) __D,
					__E, (__v8hi) __A, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
{
  return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
			__D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldi_epi16 (__m128i __A, __m128i __B, int __C)
{
  return (__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)__A, (__v8hi) __B, __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
								int __E)
{
  return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__C, (__v4si) __D,
					__E, (__v4si) __A, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
{
  return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__B, (__v4si) __C,
			__D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldi_epi32 (__m128i __A, __m128i __B, int __C)
{
  return (__m128i) __builtin_ia32_vpshld_v4si ((__v4si)__A, (__v4si) __B, __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
								int __E)
{
  return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__C, (__v2di) __D,
					__E, (__v2di) __A, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
{
  return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__B, (__v2di) __C,
			__D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldi_epi64 (__m128i __A, __m128i __B, int __C)
{
  return (__m128i) __builtin_ia32_vpshld_v2di ((__v2di)__A, (__v2di) __B, __C);
}
#else
#define _mm256_shrdi_epi16(A, B, C) \
  ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), \
					  (__v16hi)(__m256i)(B),(int)(C)))
#define _mm256_mask_shrdi_epi16(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), \
					       (__v16hi)(__m256i)(D), \
					       (int)(E),		\
					       (__v16hi)(__m256i)(A), \
					       (__mmask16)(B)))
#define _mm256_maskz_shrdi_epi16(A, B, C, D) \
  ((__m256i) \
   __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B),		\
				     (__v16hi)(__m256i)(C),(int)(D),	\
				     (__v16hi)(__m256i)_mm256_setzero_si256 (), \
				     (__mmask16)(A)))
#define _mm256_shrdi_epi32(A, B, C) \
  ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), \
					 (__v8si)(__m256i)(B),(int)(C)))
#define _mm256_mask_shrdi_epi32(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), \
					      (__v8si)(__m256i)(D), \
					      (int)(E), \
					      (__v8si)(__m256i)(A), \
					      (__mmask8)(B)))
#define _mm256_maskz_shrdi_epi32(A, B, C, D) \
  ((__m256i) \
   __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B),		\
				    (__v8si)(__m256i)(C),(int)(D),	\
				    (__v8si)(__m256i)_mm256_setzero_si256 (), \
				    (__mmask8)(A)))
#define _mm256_shrdi_epi64(A, B, C) \
  ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), \
					 (__v4di)(__m256i)(B),(int)(C)))
#define _mm256_mask_shrdi_epi64(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), \
					      (__v4di)(__m256i)(D), (int)(E), \
					      (__v4di)(__m256i)(A), \
					      (__mmask8)(B)))
#define _mm256_maskz_shrdi_epi64(A, B, C, D) \
  ((__m256i) \
   __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B),		\
				    (__v4di)(__m256i)(C),(int)(D),	\
				    (__v4di)(__m256i)_mm256_setzero_si256 (), \
				    (__mmask8)(A)))
#define _mm_shrdi_epi16(A, B, C) \
  ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), \
					 (__v8hi)(__m128i)(B),(int)(C)))
#define _mm_mask_shrdi_epi16(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), \
					      (__v8hi)(__m128i)(D), (int)(E), \
					      (__v8hi)(__m128i)(A), \
					      (__mmask8)(B)))
#define _mm_maskz_shrdi_epi16(A, B, C, D) \
  ((__m128i) \
   __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B),		\
				    (__v8hi)(__m128i)(C),(int)(D),	\
				    (__v8hi)(__m128i)_mm_setzero_si128 (), \
				    (__mmask8)(A)))
#define _mm_shrdi_epi32(A, B, C) \
  ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), \
					 (__v4si)(__m128i)(B),(int)(C)))
#define _mm_mask_shrdi_epi32(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C),	\
					      (__v4si)(__m128i)(D), (int)(E), \
					      (__v4si)(__m128i)(A), \
					      (__mmask8)(B)))
#define _mm_maskz_shrdi_epi32(A, B, C, D) \
  ((__m128i) \
   __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B),		\
				    (__v4si)(__m128i)(C),(int)(D),	\
				    (__v4si)(__m128i)_mm_setzero_si128 (), \
				    (__mmask8)(A)))
#define _mm_shrdi_epi64(A, B, C) \
  ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), \
					 (__v2di)(__m128i)(B),(int)(C)))
#define _mm_mask_shrdi_epi64(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), \
					      (__v2di)(__m128i)(D), (int)(E), \
					      (__v2di)(__m128i)(A), \
					      (__mmask8)(B)))
#define _mm_maskz_shrdi_epi64(A, B, C, D) \
  ((__m128i) \
   __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B),		\
				    (__v2di)(__m128i)(C),(int)(D),	\
				    (__v2di)(__m128i)_mm_setzero_si128 (), \
				    (__mmask8)(A)))
#define _mm256_shldi_epi16(A, B, C) \
  ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), \
					  (__v16hi)(__m256i)(B),(int)(C)))
#define _mm256_mask_shldi_epi16(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), \
					       (__v16hi)(__m256i)(D), \
					       (int)(E),		\
					       (__v16hi)(__m256i)(A), \
					       (__mmask16)(B)))
#define _mm256_maskz_shldi_epi16(A, B, C, D) \
  ((__m256i) \
   __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B),		\
				     (__v16hi)(__m256i)(C),(int)(D),	\
				     (__v16hi)(__m256i)_mm256_setzero_si256 (), \
				     (__mmask16)(A)))
#define _mm256_shldi_epi32(A, B, C) \
  ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), \
					 (__v8si)(__m256i)(B),(int)(C)))
#define _mm256_mask_shldi_epi32(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), \
					      (__v8si)(__m256i)(D), (int)(E), \
					      (__v8si)(__m256i)(A), \
					      (__mmask8)(B)))
#define _mm256_maskz_shldi_epi32(A, B, C, D) \
  ((__m256i) \
   __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B),		\
				    (__v8si)(__m256i)(C),(int)(D),	\
				    (__v8si)(__m256i)_mm256_setzero_si256 (), \
				    (__mmask8)(A)))
#define _mm256_shldi_epi64(A, B, C) \
  ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), \
					 (__v4di)(__m256i)(B),(int)(C)))
#define _mm256_mask_shldi_epi64(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), \
					      (__v4di)(__m256i)(D), (int)(E), \
					      (__v4di)(__m256i)(A), \
					      (__mmask8)(B)))
#define _mm256_maskz_shldi_epi64(A, B, C, D) \
  ((__m256i) \
   __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B),		\
				    (__v4di)(__m256i)(C),(int)(D),	\
				    (__v4di)(__m256i)_mm256_setzero_si256 (), \
				    (__mmask8)(A)))
#define _mm_shldi_epi16(A, B, C) \
  ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), \
					 (__v8hi)(__m128i)(B),(int)(C)))
#define _mm_mask_shldi_epi16(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), \
					      (__v8hi)(__m128i)(D), (int)(E), \
					      (__v8hi)(__m128i)(A), \
					      (__mmask8)(B)))
#define _mm_maskz_shldi_epi16(A, B, C, D) \
  ((__m128i) \
   __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B),		\
				    (__v8hi)(__m128i)(C),(int)(D),	\
				    (__v8hi)(__m128i)_mm_setzero_si128 (), \
				    (__mmask8)(A)))
#define _mm_shldi_epi32(A, B, C) \
  ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), \
					 (__v4si)(__m128i)(B),(int)(C)))
#define _mm_mask_shldi_epi32(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \
					      (__v4si)(__m128i)(D), (int)(E), \
					      (__v4si)(__m128i)(A), \
					      (__mmask8)(B)))
#define _mm_maskz_shldi_epi32(A, B, C, D) \
  ((__m128i) \
   __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B),		\
				    (__v4si)(__m128i)(C),(int)(D),	\
				    (__v4si)(__m128i)_mm_setzero_si128 (), \
				    (__mmask8)(A)))
#define _mm_shldi_epi64(A, B, C) \
  ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), \
					 (__v2di)(__m128i)(B),(int)(C)))
#define _mm_mask_shldi_epi64(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), \
					      (__v2di)(__m128i)(D), (int)(E), \
					      (__v2di)(__m128i)(A), \
					      (__mmask8)(B)))
#define _mm_maskz_shldi_epi64(A, B, C, D) \
  ((__m128i) \
   __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B),		\
				    (__v2di)(__m128i)(C),(int)(D),	\
				    (__v2di)(__m128i)_mm_setzero_si128 (), \
				    (__mmask8)(A)))
#endif

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdv_epi16 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpshrdv_v16hi ((__v16hi)__A, (__v16hi) __B,
								(__v16hi) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask ((__v16hi)__A,
				(__v16hi) __C, (__v16hi) __D, (__mmask16)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz ((__v16hi)__B,
				(__v16hi) __C, (__v16hi) __D, (__mmask16)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdv_epi32 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpshrdv_v8si ((__v8si)__A, (__v8si) __B,
								(__v8si) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshrdv_v8si_mask ((__v8si)__A, (__v8si) __C,
						(__v8si) __D, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz ((__v8si)__B, (__v8si) __C,
						 (__v8si) __D, (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdv_epi64 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpshrdv_v4di ((__v4di)__A, (__v4di) __B,
								(__v4di) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshrdv_v4di_mask ((__v4di)__A, (__v4di) __C,
						(__v4di) __D, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz ((__v4di)__B, (__v4di) __C,
						 (__v4di) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdv_epi16 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpshrdv_v8hi ((__v8hi)__A, (__v8hi) __B,
								(__v8hi) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask ((__v8hi)__A, (__v8hi) __C,
						(__v8hi) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C,
						 (__v8hi) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdv_epi32 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpshrdv_v4si ((__v4si)__A, (__v4si) __B,
								(__v4si) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshrdv_v4si_mask ((__v4si)__A, (__v4si) __C,
						(__v4si) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz ((__v4si)__B, (__v4si) __C,
						 (__v4si) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdv_epi64 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpshrdv_v2di ((__v2di)__A, (__v2di) __B,
								(__v2di) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshrdv_v2di_mask ((__v2di)__A, (__v2di) __C,
						(__v2di) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz ((__v2di)__B, (__v2di) __C,
						 (__v2di) __D, (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldv_epi16 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpshldv_v16hi ((__v16hi)__A, (__v16hi) __B,
								(__v16hi) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshldv_v16hi_mask ((__v16hi)__A,
				(__v16hi) __C, (__v16hi) __D, (__mmask16)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz ((__v16hi)__B,
				(__v16hi) __C, (__v16hi) __D, (__mmask16)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldv_epi32 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpshldv_v8si ((__v8si)__A, (__v8si) __B,
								(__v8si) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshldv_v8si_mask ((__v8si)__A, (__v8si) __C,
						(__v8si) __D, (__mmask8)__B) ;
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshldv_v8si_maskz ((__v8si)__B, (__v8si) __C,
						(__v8si) __D, (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldv_epi64 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpshldv_v4di ((__v4di)__A, (__v4di) __B,
								(__v4di) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshldv_v4di_mask ((__v4di)__A, (__v4di) __C,
						(__v4di) __D, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpshldv_v4di_maskz ((__v4di)__B, (__v4di) __C,
						 (__v4di) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldv_epi16 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpshldv_v8hi ((__v8hi)__A, (__v8hi) __B,
								(__v8hi) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshldv_v8hi_mask ((__v8hi)__A, (__v8hi) __C,
						(__v8hi) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C,
						 (__v8hi) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldv_epi32 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpshldv_v4si ((__v4si)__A, (__v4si) __B,
								(__v4si) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshldv_v4si_mask ((__v4si)__A, (__v4si) __C,
						(__v4si) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshldv_v4si_maskz ((__v4si)__B, (__v4si) __C,
						 (__v4si) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldv_epi64 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpshldv_v2di ((__v2di)__A, (__v2di) __B,
								(__v2di) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshldv_v2di_mask ((__v2di)__A, (__v2di) __C,
						(__v2di) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpshldv_v2di_maskz ((__v2di)__B, (__v2di) __C,
						(__v2di) __D, (__mmask8)__A);
}




#ifdef __DISABLE_AVX512VBMI2VL__
#undef __DISABLE_AVX512VBMI2VL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMIVL__ */

#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \
    !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512vbmi2,avx512vl,avx512bw")
#define __DISABLE_AVX512VBMI2VLBW__
#endif /* __AVX512VBMIVLBW__ */

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compress_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi)__C,
						(__v32qi)__A, (__mmask32)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_compress_epi8 (__mmask32 __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __B,
			(__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compressstoreu_epi8 (void * __A, __mmask32 __B, __m256i __C)
{
  __builtin_ia32_compressstoreuqi256_mask ((__v32qi *) __A, (__v32qi) __C,
							(__mmask32) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expand_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __C,
						    (__v32qi) __A,
						    (__mmask32) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expand_epi8 (__mmask32 __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_expandqi256_maskz ((__v32qi) __B,
			(__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expandloadu_epi8 (__m256i __A, __mmask32 __B, const void * __C)
{
  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *) __C,
					(__v32qi) __A, (__mmask32) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expandloadu_epi8 (__mmask32 __A, const void * __B)
{
  return (__m256i) __builtin_ia32_expandloadqi256_maskz ((const __v32qi *) __B,
			(__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
}

#ifdef __DISABLE_AVX512VBMI2VLBW__
#undef __DISABLE_AVX512VBMI2VLBW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMIVLBW__ */

#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */
/* Copyright (C) 2008-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef signed char __v32qs __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Unaligned version of the same types.  */
typedef float __m256_u __attribute__ ((__vector_size__ (32),
				       __may_alias__,
				       __aligned__ (1)));
typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
					    __may_alias__,
					    __aligned__ (1)));
typedef double __m256d_u __attribute__ ((__vector_size__ (32),
					 __may_alias__,
					 __aligned__ (1)));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (nonsignaling)   */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}


extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsd_f64 (__m256d __A)
{
  return __A[0];
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtss_f32 (__m256 __A)
{
  return __A[0];
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),  	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return *(__m256d_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  *(__m256d_u *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return *(__m256_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  *(__m256_u *)__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
		   char __q27, char __q26, char __q25, char __q24,
		   char __q23, char __q22, char __q21, char __q20,
		   char __q19, char __q18, char __q17, char __q16,
		   char __q15, char __q14, char __q13, char __q12,
		   char __q11, char __q10, char __q09, char __q08,
		   char __q07, char __q06, char __q05, char __q04,
		   char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256(__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128 ( __m128 __H, __m128 __L)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128d (__m128d __H, __m128d __L)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128i (__m128i __H, __m128i __L)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128 (__m128 __L, __m128 __H)
{
  return _mm256_set_m128 (__H, __L);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128d (__m128d __L, __m128d __H)
{
  return _mm256_set_m128d (__H, __L);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128i (__m128i __L, __m128i __H)
{
  return _mm256_set_m128i (__H, __L);
}

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512CDINTRIN_H_INCLUDED
#define _AVX512CDINTRIN_H_INCLUDED

#ifndef __AVX512CD__
#pragma GCC push_options
#pragma GCC target("avx512cd")
#define __DISABLE_AVX512CD__
#endif /* __AVX512CD__ */

/* Internal data types for implementing the intrinsics.  */
typedef long long __v8di __attribute__ ((__vector_size__ (64)));
typedef int __v16si __attribute__ ((__vector_size__ (64)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));

typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_conflict_epi32 (__m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
					       (__v16si) _mm512_setzero_si512 (),
					       (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
							 (__v16si) __W,
							 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
					       (__v16si) _mm512_setzero_si512 (),
					       (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_conflict_epi64 (__m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
					       (__v8di) _mm512_setzero_si512 (),
					       (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
							 (__v8di) __W,
							 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
					       (__v8di) _mm512_setzero_si512 (),
					       (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_lzcnt_epi64 (__m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
					   (__v8di) _mm512_setzero_si512 (),
					   (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
						     (__v8di) __W,
						     (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
					   (__v8di) _mm512_setzero_si512 (),
					   (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_lzcnt_epi32 (__m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
					   (__v16si) _mm512_setzero_si512 (),
					   (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
						     (__v16si) __W,
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i)
	 __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
					   (__v16si) _mm512_setzero_si512 (),
					   (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
}

#ifdef __DISABLE_AVX512CD__
#undef __DISABLE_AVX512CD__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512CD__ */

#endif /* _AVX512CDINTRIN_H_INCLUDED */
/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*
 * ISO C Standard:  5.2.4.2.2  Characteristics of floating types <float.h>
 */

#ifndef _FLOAT_H___
#define _FLOAT_H___

/* Radix of exponent representation, b. */
#undef FLT_RADIX
#define FLT_RADIX	__FLT_RADIX__

/* Number of base-FLT_RADIX digits in the significand, p.  */
#undef FLT_MANT_DIG
#undef DBL_MANT_DIG
#undef LDBL_MANT_DIG
#define FLT_MANT_DIG	__FLT_MANT_DIG__
#define DBL_MANT_DIG	__DBL_MANT_DIG__
#define LDBL_MANT_DIG	__LDBL_MANT_DIG__

/* Number of decimal digits, q, such that any floating-point number with q
   decimal digits can be rounded into a floating-point number with p radix b
   digits and back again without change to the q decimal digits,

	p * log10(b)			if b is a power of 10
	floor((p - 1) * log10(b))	otherwise
*/
#undef FLT_DIG
#undef DBL_DIG
#undef LDBL_DIG
#define FLT_DIG		__FLT_DIG__
#define DBL_DIG		__DBL_DIG__
#define LDBL_DIG	__LDBL_DIG__

/* Minimum int x such that FLT_RADIX**(x-1) is a normalized float, emin */
#undef FLT_MIN_EXP
#undef DBL_MIN_EXP
#undef LDBL_MIN_EXP
#define FLT_MIN_EXP	__FLT_MIN_EXP__
#define DBL_MIN_EXP	__DBL_MIN_EXP__
#define LDBL_MIN_EXP	__LDBL_MIN_EXP__

/* Minimum negative integer such that 10 raised to that power is in the
   range of normalized floating-point numbers,

	ceil(log10(b) * (emin - 1))
*/
#undef FLT_MIN_10_EXP
#undef DBL_MIN_10_EXP
#undef LDBL_MIN_10_EXP
#define FLT_MIN_10_EXP	__FLT_MIN_10_EXP__
#define DBL_MIN_10_EXP	__DBL_MIN_10_EXP__
#define LDBL_MIN_10_EXP	__LDBL_MIN_10_EXP__

/* Maximum int x such that FLT_RADIX**(x-1) is a representable float, emax.  */
#undef FLT_MAX_EXP
#undef DBL_MAX_EXP
#undef LDBL_MAX_EXP
#define FLT_MAX_EXP	__FLT_MAX_EXP__
#define DBL_MAX_EXP	__DBL_MAX_EXP__
#define LDBL_MAX_EXP	__LDBL_MAX_EXP__

/* Maximum integer such that 10 raised to that power is in the range of
   representable finite floating-point numbers,

	floor(log10((1 - b**-p) * b**emax))
*/
#undef FLT_MAX_10_EXP
#undef DBL_MAX_10_EXP
#undef LDBL_MAX_10_EXP
#define FLT_MAX_10_EXP	__FLT_MAX_10_EXP__
#define DBL_MAX_10_EXP	__DBL_MAX_10_EXP__
#define LDBL_MAX_10_EXP	__LDBL_MAX_10_EXP__

/* Maximum representable finite floating-point number,

	(1 - b**-p) * b**emax
*/
#undef FLT_MAX
#undef DBL_MAX
#undef LDBL_MAX
#define FLT_MAX		__FLT_MAX__
#define DBL_MAX		__DBL_MAX__
#define LDBL_MAX	__LDBL_MAX__

/* The difference between 1 and the least value greater than 1 that is
   representable in the given floating point type, b**1-p.  */
#undef FLT_EPSILON
#undef DBL_EPSILON
#undef LDBL_EPSILON
#define FLT_EPSILON	__FLT_EPSILON__
#define DBL_EPSILON	__DBL_EPSILON__
#define LDBL_EPSILON	__LDBL_EPSILON__

/* Minimum normalized positive floating-point number, b**(emin - 1).  */
#undef FLT_MIN
#undef DBL_MIN
#undef LDBL_MIN
#define FLT_MIN		__FLT_MIN__
#define DBL_MIN		__DBL_MIN__
#define LDBL_MIN	__LDBL_MIN__

/* Addition rounds to 0: zero, 1: nearest, 2: +inf, 3: -inf, -1: unknown.  */
/* ??? This is supposed to change with calls to fesetround in <fenv.h>.  */
#undef FLT_ROUNDS
#define FLT_ROUNDS 1

#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) \
     || (defined (__cplusplus) && __cplusplus >= 201103L)
/* The floating-point expression evaluation method.  The precise
   definitions of these values are generalised to include support for
   the interchange and extended types defined in ISO/IEC TS 18661-3.
   Prior to this (for C99/C11) the definitions were:

	-1  indeterminate
	 0  evaluate all operations and constants just to the range and
	    precision of the type
	 1  evaluate operations and constants of type float and double
	    to the range and precision of the double type, evaluate
	    long double operations and constants to the range and
	    precision of the long double type
	 2  evaluate all operations and constants to the range and
	    precision of the long double type

   The TS 18661-3 definitions are:

	-1  indeterminate
	 0  evaluate all operations and constants, whose semantic type has
	    at most the range and precision of float, to the range and
	    precision of float; evaluate all other operations and constants
	    to the range and precision of the semantic type.
	 1  evaluate all operations and constants, whose semantic type has
	    at most the range and precision of double, to the range and
	    precision of double; evaluate all other operations and constants
	    to the range and precision of the semantic type.
	 2  evaluate all operations and constants, whose semantic type has
	    at most the range and precision of long double, to the range and
	    precision of long double; evaluate all other operations and
	    constants to the range and precision of the semantic type.
	 N  where _FloatN  is a supported interchange floating type
	    evaluate all operations and constants, whose semantic type has
	    at most the range and precision of the _FloatN type, to the
	    range and precision of the _FloatN type; evaluate all other
	    operations and constants to the range and precision of the
	    semantic type.
	 N + 1, where _FloatNx is a supported extended floating type
	    evaluate operations and constants, whose semantic type has at
	    most the range and precision of the _FloatNx type, to the range
	    and precision of the _FloatNx type; evaluate all other
	    operations and constants to the range and precision of the
	    semantic type.

   The compiler predefines two macros:

      __FLT_EVAL_METHOD__
      Which, depending on the value given for
      -fpermitted-flt-eval-methods, may be limited to only those values
      for FLT_EVAL_METHOD defined in C99/C11.

     __FLT_EVAL_METHOD_TS_18661_3__
      Which always permits the values for FLT_EVAL_METHOD defined in
      ISO/IEC TS 18661-3.

     Here we want to use __FLT_EVAL_METHOD__, unless
     __STDC_WANT_IEC_60559_TYPES_EXT__ is defined, in which case the user
     is specifically asking for the ISO/IEC TS 18661-3 types, so we use
     __FLT_EVAL_METHOD_TS_18661_3__.

   ??? This ought to change with the setting of the fp control word;
   the value provided by the compiler assumes the widest setting.  */
#undef FLT_EVAL_METHOD
#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
#define FLT_EVAL_METHOD __FLT_EVAL_METHOD_TS_18661_3__
#else
#define FLT_EVAL_METHOD	__FLT_EVAL_METHOD__
#endif

/* Number of decimal digits, n, such that any floating-point number in the
   widest supported floating type with pmax radix b digits can be rounded
   to a floating-point number with n decimal digits and back again without
   change to the value,

	pmax * log10(b)			if b is a power of 10
	ceil(1 + pmax * log10(b))	otherwise
*/
#undef DECIMAL_DIG
#define DECIMAL_DIG	__DECIMAL_DIG__

#endif /* C99 */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
/* Versions of DECIMAL_DIG for each floating-point type.  */
#undef FLT_DECIMAL_DIG
#undef DBL_DECIMAL_DIG
#undef LDBL_DECIMAL_DIG
#define FLT_DECIMAL_DIG		__FLT_DECIMAL_DIG__
#define DBL_DECIMAL_DIG		__DBL_DECIMAL_DIG__
#define LDBL_DECIMAL_DIG	__LDBL_DECIMAL_DIG__

/* Whether types support subnormal numbers.  */
#undef FLT_HAS_SUBNORM
#undef DBL_HAS_SUBNORM
#undef LDBL_HAS_SUBNORM
#define FLT_HAS_SUBNORM		__FLT_HAS_DENORM__
#define DBL_HAS_SUBNORM		__DBL_HAS_DENORM__
#define LDBL_HAS_SUBNORM	__LDBL_HAS_DENORM__

/* Minimum positive values, including subnormals.  */
#undef FLT_TRUE_MIN
#undef DBL_TRUE_MIN
#undef LDBL_TRUE_MIN
#define FLT_TRUE_MIN	__FLT_DENORM_MIN__
#define DBL_TRUE_MIN	__DBL_DENORM_MIN__
#define LDBL_TRUE_MIN	__LDBL_DENORM_MIN__

#endif /* C11 */

#ifdef __STDC_WANT_IEC_60559_BFP_EXT__
/* Number of decimal digits for which conversions between decimal
   character strings and binary formats, in both directions, are
   correctly rounded.  */
#define CR_DECIMAL_DIG	__UINTMAX_MAX__
#endif

#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
/* Constants for _FloatN and _FloatNx types from TS 18661-3.  See
   comments above for their semantics.  */

#ifdef __FLT16_MANT_DIG__
#undef FLT16_MANT_DIG
#define FLT16_MANT_DIG		__FLT16_MANT_DIG__
#undef FLT16_DIG
#define FLT16_DIG		__FLT16_DIG__
#undef FLT16_MIN_EXP
#define FLT16_MIN_EXP		__FLT16_MIN_EXP__
#undef FLT16_MIN_10_EXP
#define FLT16_MIN_10_EXP	__FLT16_MIN_10_EXP__
#undef FLT16_MAX_EXP
#define FLT16_MAX_EXP		__FLT16_MAX_EXP__
#undef FLT16_MAX_10_EXP
#define FLT16_MAX_10_EXP	__FLT16_MAX_10_EXP__
#undef FLT16_MAX
#define FLT16_MAX		__FLT16_MAX__
#undef FLT16_EPSILON
#define FLT16_EPSILON		__FLT16_EPSILON__
#undef FLT16_MIN
#define FLT16_MIN		__FLT16_MIN__
#undef FLT16_DECIMAL_DIG
#define FLT16_DECIMAL_DIG	__FLT16_DECIMAL_DIG__
#undef FLT16_TRUE_MIN
#define FLT16_TRUE_MIN		__FLT16_DENORM_MIN__
#endif /* __FLT16_MANT_DIG__.  */

#ifdef __FLT32_MANT_DIG__
#undef FLT32_MANT_DIG
#define FLT32_MANT_DIG		__FLT32_MANT_DIG__
#undef FLT32_DIG
#define FLT32_DIG		__FLT32_DIG__
#undef FLT32_MIN_EXP
#define FLT32_MIN_EXP		__FLT32_MIN_EXP__
#undef FLT32_MIN_10_EXP
#define FLT32_MIN_10_EXP	__FLT32_MIN_10_EXP__
#undef FLT32_MAX_EXP
#define FLT32_MAX_EXP		__FLT32_MAX_EXP__
#undef FLT32_MAX_10_EXP
#define FLT32_MAX_10_EXP	__FLT32_MAX_10_EXP__
#undef FLT32_MAX
#define FLT32_MAX		__FLT32_MAX__
#undef FLT32_EPSILON
#define FLT32_EPSILON		__FLT32_EPSILON__
#undef FLT32_MIN
#define FLT32_MIN		__FLT32_MIN__
#undef FLT32_DECIMAL_DIG
#define FLT32_DECIMAL_DIG	__FLT32_DECIMAL_DIG__
#undef FLT32_TRUE_MIN
#define FLT32_TRUE_MIN		__FLT32_DENORM_MIN__
#endif /* __FLT32_MANT_DIG__.  */

#ifdef __FLT64_MANT_DIG__
#undef FLT64_MANT_DIG
#define FLT64_MANT_DIG		__FLT64_MANT_DIG__
#undef FLT64_DIG
#define FLT64_DIG		__FLT64_DIG__
#undef FLT64_MIN_EXP
#define FLT64_MIN_EXP		__FLT64_MIN_EXP__
#undef FLT64_MIN_10_EXP
#define FLT64_MIN_10_EXP	__FLT64_MIN_10_EXP__
#undef FLT64_MAX_EXP
#define FLT64_MAX_EXP		__FLT64_MAX_EXP__
#undef FLT64_MAX_10_EXP
#define FLT64_MAX_10_EXP	__FLT64_MAX_10_EXP__
#undef FLT64_MAX
#define FLT64_MAX		__FLT64_MAX__
#undef FLT64_EPSILON
#define FLT64_EPSILON		__FLT64_EPSILON__
#undef FLT64_MIN
#define FLT64_MIN		__FLT64_MIN__
#undef FLT64_DECIMAL_DIG
#define FLT64_DECIMAL_DIG	__FLT64_DECIMAL_DIG__
#undef FLT64_TRUE_MIN
#define FLT64_TRUE_MIN		__FLT64_DENORM_MIN__
#endif /* __FLT64_MANT_DIG__.  */

#ifdef __FLT128_MANT_DIG__
#undef FLT128_MANT_DIG
#define FLT128_MANT_DIG		__FLT128_MANT_DIG__
#undef FLT128_DIG
#define FLT128_DIG		__FLT128_DIG__
#undef FLT128_MIN_EXP
#define FLT128_MIN_EXP		__FLT128_MIN_EXP__
#undef FLT128_MIN_10_EXP
#define FLT128_MIN_10_EXP	__FLT128_MIN_10_EXP__
#undef FLT128_MAX_EXP
#define FLT128_MAX_EXP		__FLT128_MAX_EXP__
#undef FLT128_MAX_10_EXP
#define FLT128_MAX_10_EXP	__FLT128_MAX_10_EXP__
#undef FLT128_MAX
#define FLT128_MAX		__FLT128_MAX__
#undef FLT128_EPSILON
#define FLT128_EPSILON		__FLT128_EPSILON__
#undef FLT128_MIN
#define FLT128_MIN		__FLT128_MIN__
#undef FLT128_DECIMAL_DIG
#define FLT128_DECIMAL_DIG	__FLT128_DECIMAL_DIG__
#undef FLT128_TRUE_MIN
#define FLT128_TRUE_MIN		__FLT128_DENORM_MIN__
#endif /* __FLT128_MANT_DIG__.  */

#ifdef __FLT32X_MANT_DIG__
#undef FLT32X_MANT_DIG
#define FLT32X_MANT_DIG		__FLT32X_MANT_DIG__
#undef FLT32X_DIG
#define FLT32X_DIG		__FLT32X_DIG__
#undef FLT32X_MIN_EXP
#define FLT32X_MIN_EXP		__FLT32X_MIN_EXP__
#undef FLT32X_MIN_10_EXP
#define FLT32X_MIN_10_EXP	__FLT32X_MIN_10_EXP__
#undef FLT32X_MAX_EXP
#define FLT32X_MAX_EXP		__FLT32X_MAX_EXP__
#undef FLT32X_MAX_10_EXP
#define FLT32X_MAX_10_EXP	__FLT32X_MAX_10_EXP__
#undef FLT32X_MAX
#define FLT32X_MAX		__FLT32X_MAX__
#undef FLT32X_EPSILON
#define FLT32X_EPSILON		__FLT32X_EPSILON__
#undef FLT32X_MIN
#define FLT32X_MIN		__FLT32X_MIN__
#undef FLT32X_DECIMAL_DIG
#define FLT32X_DECIMAL_DIG	__FLT32X_DECIMAL_DIG__
#undef FLT32X_TRUE_MIN
#define FLT32X_TRUE_MIN		__FLT32X_DENORM_MIN__
#endif /* __FLT32X_MANT_DIG__.  */

#ifdef __FLT64X_MANT_DIG__
#undef FLT64X_MANT_DIG
#define FLT64X_MANT_DIG		__FLT64X_MANT_DIG__
#undef FLT64X_DIG
#define FLT64X_DIG		__FLT64X_DIG__
#undef FLT64X_MIN_EXP
#define FLT64X_MIN_EXP		__FLT64X_MIN_EXP__
#undef FLT64X_MIN_10_EXP
#define FLT64X_MIN_10_EXP	__FLT64X_MIN_10_EXP__
#undef FLT64X_MAX_EXP
#define FLT64X_MAX_EXP		__FLT64X_MAX_EXP__
#undef FLT64X_MAX_10_EXP
#define FLT64X_MAX_10_EXP	__FLT64X_MAX_10_EXP__
#undef FLT64X_MAX
#define FLT64X_MAX		__FLT64X_MAX__
#undef FLT64X_EPSILON
#define FLT64X_EPSILON		__FLT64X_EPSILON__
#undef FLT64X_MIN
#define FLT64X_MIN		__FLT64X_MIN__
#undef FLT64X_DECIMAL_DIG
#define FLT64X_DECIMAL_DIG	__FLT64X_DECIMAL_DIG__
#undef FLT64X_TRUE_MIN
#define FLT64X_TRUE_MIN		__FLT64X_DENORM_MIN__
#endif /* __FLT64X_MANT_DIG__.  */

#ifdef __FLT128X_MANT_DIG__
#undef FLT128X_MANT_DIG
#define FLT128X_MANT_DIG	__FLT128X_MANT_DIG__
#undef FLT128X_DIG
#define FLT128X_DIG		__FLT128X_DIG__
#undef FLT128X_MIN_EXP
#define FLT128X_MIN_EXP		__FLT128X_MIN_EXP__
#undef FLT128X_MIN_10_EXP
#define FLT128X_MIN_10_EXP	__FLT128X_MIN_10_EXP__
#undef FLT128X_MAX_EXP
#define FLT128X_MAX_EXP		__FLT128X_MAX_EXP__
#undef FLT128X_MAX_10_EXP
#define FLT128X_MAX_10_EXP	__FLT128X_MAX_10_EXP__
#undef FLT128X_MAX
#define FLT128X_MAX		__FLT128X_MAX__
#undef FLT128X_EPSILON
#define FLT128X_EPSILON		__FLT128X_EPSILON__
#undef FLT128X_MIN
#define FLT128X_MIN		__FLT128X_MIN__
#undef FLT128X_DECIMAL_DIG
#define FLT128X_DECIMAL_DIG	__FLT128X_DECIMAL_DIG__
#undef FLT128X_TRUE_MIN
#define FLT128X_TRUE_MIN	__FLT128X_DENORM_MIN__
#endif /* __FLT128X_MANT_DIG__.  */

#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__.  */

#ifdef __STDC_WANT_DEC_FP__
/* Draft Technical Report 24732, extension for decimal floating-point
   arithmetic: Characteristic of decimal floating types <float.h>.  */

/* Number of base-FLT_RADIX digits in the significand, p.  */
#undef DEC32_MANT_DIG
#undef DEC64_MANT_DIG
#undef DEC128_MANT_DIG
#define DEC32_MANT_DIG	__DEC32_MANT_DIG__
#define DEC64_MANT_DIG	__DEC64_MANT_DIG__
#define DEC128_MANT_DIG	__DEC128_MANT_DIG__

/* Minimum exponent. */
#undef DEC32_MIN_EXP
#undef DEC64_MIN_EXP
#undef DEC128_MIN_EXP
#define DEC32_MIN_EXP	__DEC32_MIN_EXP__
#define DEC64_MIN_EXP	__DEC64_MIN_EXP__
#define DEC128_MIN_EXP	__DEC128_MIN_EXP__

/* Maximum exponent. */
#undef DEC32_MAX_EXP
#undef DEC64_MAX_EXP
#undef DEC128_MAX_EXP
#define DEC32_MAX_EXP	__DEC32_MAX_EXP__
#define DEC64_MAX_EXP	__DEC64_MAX_EXP__
#define DEC128_MAX_EXP	__DEC128_MAX_EXP__

/* Maximum representable finite decimal floating-point number
   (there are 6, 15, and 33 9s after the decimal points respectively). */
#undef DEC32_MAX
#undef DEC64_MAX
#undef DEC128_MAX
#define DEC32_MAX   __DEC32_MAX__
#define DEC64_MAX   __DEC64_MAX__
#define DEC128_MAX  __DEC128_MAX__

/* The difference between 1 and the least value greater than 1 that is
   representable in the given floating point type. */
#undef DEC32_EPSILON
#undef DEC64_EPSILON
#undef DEC128_EPSILON
#define DEC32_EPSILON	__DEC32_EPSILON__
#define DEC64_EPSILON	__DEC64_EPSILON__
#define DEC128_EPSILON	__DEC128_EPSILON__

/* Minimum normalized positive floating-point number. */
#undef DEC32_MIN
#undef DEC64_MIN
#undef DEC128_MIN
#define DEC32_MIN	__DEC32_MIN__
#define DEC64_MIN	__DEC64_MIN__
#define DEC128_MIN	__DEC128_MIN__

/* Minimum subnormal positive floating-point number. */
#undef DEC32_SUBNORMAL_MIN
#undef DEC64_SUBNORMAL_MIN
#undef DEC128_SUBNORMAL_MIN
#define DEC32_SUBNORMAL_MIN       __DEC32_SUBNORMAL_MIN__
#define DEC64_SUBNORMAL_MIN       __DEC64_SUBNORMAL_MIN__
#define DEC128_SUBNORMAL_MIN      __DEC128_SUBNORMAL_MIN__

/* The floating-point expression evaluation method.
         -1  indeterminate
         0  evaluate all operations and constants just to the range and
            precision of the type
         1  evaluate operations and constants of type _Decimal32 
	    and _Decimal64 to the range and precision of the _Decimal64 
            type, evaluate _Decimal128 operations and constants to the 
	    range and precision of the _Decimal128 type;
	 2  evaluate all operations and constants to the range and
	    precision of the _Decimal128 type.  */

#undef DEC_EVAL_METHOD
#define DEC_EVAL_METHOD	__DEC_EVAL_METHOD__

#endif /* __STDC_WANT_DEC_FP__ */

#endif /* _FLOAT_H___ */
/* ELF program property for Intel CET.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.
 */

/* Add x86 feature with IBT and/or SHSTK bits to ELF program property
   if they are enabled.  Otherwise, contents in this header file are
   unused.  Define _CET_ENDBR for assembly codes.  _CET_ENDBR should be
   placed unconditionally at the entrance of a function whose address
   may be taken.  */

#ifndef _CET_H_INCLUDED
#define _CET_H_INCLUDED

#ifdef __ASSEMBLER__

# if defined __CET__ && (__CET__ & 1) != 0
#  ifdef __x86_64__
#   define _CET_ENDBR endbr64
#  else
#   define _CET_ENDBR endbr32
#  endif
# else
#  define _CET_ENDBR
# endif

# ifdef __ELF__
#  ifdef __CET__
#   if (__CET__ & 1) != 0
/* GNU_PROPERTY_X86_FEATURE_1_IBT.  */
#    define __PROPERTY_IBT 0x1
#   else
#    define __PROPERTY_IBT 0x0
#   endif

#   if (__CET__ & 2) != 0
/* GNU_PROPERTY_X86_FEATURE_1_SHSTK.  */
#    define __PROPERTY_SHSTK 0x2
#   else
#    define __PROPERTY_SHSTK 0x0
#   endif

#   define __PROPERTY_BITS (__PROPERTY_IBT | __PROPERTY_SHSTK)

#   ifdef __LP64__
#    define __PROPERTY_ALIGN 3
#   else
#    define __PROPERTY_ALIGN 2
#   endif

	.pushsection ".note.gnu.property", "a"
	.p2align __PROPERTY_ALIGN
	.long 1f - 0f		/* name length.  */
	.long 4f - 1f		/* data length.  */
	/* NT_GNU_PROPERTY_TYPE_0.   */
	.long 5			/* note type.  */
0:
	.asciz "GNU"		/* vendor name.  */
1:
	.p2align __PROPERTY_ALIGN
	/* GNU_PROPERTY_X86_FEATURE_1_AND.  */
	.long 0xc0000002	/* pr_type.  */
	.long 3f - 2f		/* pr_datasz.  */
2:
	/* GNU_PROPERTY_X86_FEATURE_1_XXX.  */
	.long __PROPERTY_BITS
3:
	.p2align __PROPERTY_ALIGN
4:
	.popsection
#  endif /* __CET__ */
# endif /* __ELF__ */
#endif /* __ASSEMBLER__ */

#endif /* _CET_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512ERINTRIN_H_INCLUDED
#define _AVX512ERINTRIN_H_INCLUDED

#ifndef __AVX512ER__
#pragma GCC push_options
#pragma GCC target("avx512er")
#define __DISABLE_AVX512ER__
#endif /* __AVX512ER__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v8df __attribute__ ((__vector_size__ (64)));
typedef float __v16sf __attribute__ ((__vector_size__ (64)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));

typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_exp2a23_round_pd (__m512d __A, int __R)
{
  __m512d __W;
  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
					       (__v8df) __W,
					       (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
{
  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
					       (__v8df) __W,
					       (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R)
{
  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
					       (__v8df) _mm512_setzero_pd (),
					       (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_exp2a23_round_ps (__m512 __A, int __R)
{
  __m512 __W;
  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
					      (__v16sf) __W,
					      (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
{
  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
					      (__v16sf) __W,
					      (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R)
{
  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
					      (__v16sf) _mm512_setzero_ps (),
					      (__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp28_round_pd (__m512d __A, int __R)
{
  __m512d __W;
  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
						(__v8df) __W,
						(__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
{
  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
						(__v8df) __W,
						(__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R)
{
  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
						(__v8df) _mm512_setzero_pd (),
						(__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp28_round_ps (__m512 __A, int __R)
{
  __m512 __W;
  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
					       (__v16sf) __W,
					       (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
{
  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
					       (__v16sf) __W,
					       (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R)
{
  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
					       (__v16sf) _mm512_setzero_ps (),
					       (__mmask16) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R)
{
  return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __B,
						 (__v2df) __A,
						 __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R)
{
  return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __B,
						(__v4sf) __A,
						__R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt28_round_pd (__m512d __A, int __R)
{
  __m512d __W;
  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
						  (__v8df) __W,
						  (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
{
  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
						  (__v8df) __W,
						  (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R)
{
  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
						  (__v8df) _mm512_setzero_pd (),
						  (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt28_round_ps (__m512 __A, int __R)
{
  __m512 __W;
  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
						 (__v16sf) __W,
						 (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
{
  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
						 (__v16sf) __W,
						 (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R)
{
  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
						 (__v16sf) _mm512_setzero_ps (),
						 (__mmask16) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R)
{
  return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __B,
						   (__v2df) __A,
						   __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R)
{
  return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __B,
						  (__v4sf) __A,
						  __R);
}

#else
#define _mm512_exp2a23_round_pd(A, C)            \
    __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)

#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \
    __builtin_ia32_exp2pd_mask(A, W, U, C)

#define _mm512_maskz_exp2a23_round_pd(U, A, C)   \
    __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_exp2a23_round_ps(A, C)            \
    __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)

#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \
    __builtin_ia32_exp2ps_mask(A, W, U, C)

#define _mm512_maskz_exp2a23_round_ps(U, A, C)   \
    __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm512_rcp28_round_pd(A, C)            \
    __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)

#define _mm512_mask_rcp28_round_pd(W, U, A, C) \
    __builtin_ia32_rcp28pd_mask(A, W, U, C)

#define _mm512_maskz_rcp28_round_pd(U, A, C)   \
    __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_rcp28_round_ps(A, C)            \
    __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)

#define _mm512_mask_rcp28_round_ps(W, U, A, C) \
    __builtin_ia32_rcp28ps_mask(A, W, U, C)

#define _mm512_maskz_rcp28_round_ps(U, A, C)   \
    __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm512_rsqrt28_round_pd(A, C)            \
    __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)

#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \
    __builtin_ia32_rsqrt28pd_mask(A, W, U, C)

#define _mm512_maskz_rsqrt28_round_pd(U, A, C)   \
    __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_rsqrt28_round_ps(A, C)            \
    __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)

#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \
    __builtin_ia32_rsqrt28ps_mask(A, W, U, C)

#define _mm512_maskz_rsqrt28_round_ps(U, A, C)   \
    __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm_rcp28_round_sd(A, B, R)	\
    __builtin_ia32_rcp28sd_round(A, B, R)

#define _mm_rcp28_round_ss(A, B, R)	\
    __builtin_ia32_rcp28ss_round(A, B, R)

#define _mm_rsqrt28_round_sd(A, B, R)	\
    __builtin_ia32_rsqrt28sd_round(A, B, R)

#define _mm_rsqrt28_round_ss(A, B, R)	\
    __builtin_ia32_rsqrt28ss_round(A, B, R)

#endif

#define _mm512_exp2a23_pd(A)                    \
    _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_pd(W, U, A)   \
    _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_pd(U, A)     \
    _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_exp2a23_ps(A)                    \
    _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_ps(W, U, A)   \
    _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_ps(U, A)     \
    _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_rcp28_pd(A)                    \
    _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_pd(W, U, A)   \
    _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_pd(U, A)     \
    _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_rcp28_ps(A)                    \
    _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_ps(W, U, A)   \
    _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_ps(U, A)     \
    _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_rsqrt28_pd(A)                    \
    _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_pd(W, U, A)   \
    _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_pd(U, A)     \
    _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_rsqrt28_ps(A)                    \
    _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_ps(W, U, A)   \
    _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_ps(U, A)     \
    _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)

#define _mm_rcp28_sd(A, B)	\
    __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)

#define _mm_rcp28_ss(A, B)	\
    __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)

#define _mm_rsqrt28_sd(A, B)	\
    __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)

#define _mm_rsqrt28_ss(A, B)	\
    __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)

#ifdef __DISABLE_AVX512ER__
#undef __DISABLE_AVX512ER__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512ER__ */

#endif /* _AVX512ERINTRIN_H_INCLUDED */
/* Copyright (C) 2011-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

#ifndef __BMI2__
#pragma GCC push_options
#pragma GCC target("bmi2")
#define __DISABLE_BMI2__
#endif /* __BMI2__ */

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  return __builtin_ia32_bzhi_si (__X, __Y);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return __builtin_ia32_pdep_si (__X, __Y);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return __builtin_ia32_pext_si (__X, __Y);
}

#ifdef  __x86_64__

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  return __builtin_ia32_bzhi_di (__X, __Y);
}

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __Y)
{
  return __builtin_ia32_pdep_di (__X, __Y);
}

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __Y)
{
  return __builtin_ia32_pext_di (__X, __Y);
}

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
}

#else /* !__x86_64__ */

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int) (__res >> 32);
  return (unsigned int) __res;
}

#endif /* !__x86_64__  */

#ifdef __DISABLE_BMI2__
#undef __DISABLE_BMI2__
#pragma GCC pop_options
#endif /* __DISABLE_BMI2__ */

#endif /* _BMI2INTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* ISO C11 Standard:  7.17  Atomics <stdatomic.h>.  */

#ifndef _STDATOMIC_H
#define _STDATOMIC_H

typedef enum
  {
    memory_order_relaxed = __ATOMIC_RELAXED,
    memory_order_consume = __ATOMIC_CONSUME,
    memory_order_acquire = __ATOMIC_ACQUIRE,
    memory_order_release = __ATOMIC_RELEASE,
    memory_order_acq_rel = __ATOMIC_ACQ_REL,
    memory_order_seq_cst = __ATOMIC_SEQ_CST
  } memory_order;


typedef _Atomic _Bool atomic_bool;
typedef _Atomic char atomic_char;
typedef _Atomic signed char atomic_schar;
typedef _Atomic unsigned char atomic_uchar;
typedef _Atomic short atomic_short;
typedef _Atomic unsigned short atomic_ushort;
typedef _Atomic int atomic_int;
typedef _Atomic unsigned int atomic_uint;
typedef _Atomic long atomic_long;
typedef _Atomic unsigned long atomic_ulong;
typedef _Atomic long long atomic_llong;
typedef _Atomic unsigned long long atomic_ullong;
typedef _Atomic __CHAR16_TYPE__ atomic_char16_t;
typedef _Atomic __CHAR32_TYPE__ atomic_char32_t;
typedef _Atomic __WCHAR_TYPE__ atomic_wchar_t;
typedef _Atomic __INT_LEAST8_TYPE__ atomic_int_least8_t;
typedef _Atomic __UINT_LEAST8_TYPE__ atomic_uint_least8_t;
typedef _Atomic __INT_LEAST16_TYPE__ atomic_int_least16_t;
typedef _Atomic __UINT_LEAST16_TYPE__ atomic_uint_least16_t;
typedef _Atomic __INT_LEAST32_TYPE__ atomic_int_least32_t;
typedef _Atomic __UINT_LEAST32_TYPE__ atomic_uint_least32_t;
typedef _Atomic __INT_LEAST64_TYPE__ atomic_int_least64_t;
typedef _Atomic __UINT_LEAST64_TYPE__ atomic_uint_least64_t;
typedef _Atomic __INT_FAST8_TYPE__ atomic_int_fast8_t;
typedef _Atomic __UINT_FAST8_TYPE__ atomic_uint_fast8_t;
typedef _Atomic __INT_FAST16_TYPE__ atomic_int_fast16_t;
typedef _Atomic __UINT_FAST16_TYPE__ atomic_uint_fast16_t;
typedef _Atomic __INT_FAST32_TYPE__ atomic_int_fast32_t;
typedef _Atomic __UINT_FAST32_TYPE__ atomic_uint_fast32_t;
typedef _Atomic __INT_FAST64_TYPE__ atomic_int_fast64_t;
typedef _Atomic __UINT_FAST64_TYPE__ atomic_uint_fast64_t;
typedef _Atomic __INTPTR_TYPE__ atomic_intptr_t;
typedef _Atomic __UINTPTR_TYPE__ atomic_uintptr_t;
typedef _Atomic __SIZE_TYPE__ atomic_size_t;
typedef _Atomic __PTRDIFF_TYPE__ atomic_ptrdiff_t;
typedef _Atomic __INTMAX_TYPE__ atomic_intmax_t;
typedef _Atomic __UINTMAX_TYPE__ atomic_uintmax_t;        


#define ATOMIC_VAR_INIT(VALUE)	(VALUE)

/* Initialize an atomic object pointed to by PTR with VAL.  */
#define atomic_init(PTR, VAL)                           \
  atomic_store_explicit (PTR, VAL, __ATOMIC_RELAXED)

#define kill_dependency(Y)			\
  __extension__					\
  ({						\
    __auto_type __kill_dependency_tmp = (Y);	\
    __kill_dependency_tmp;			\
  })

extern void atomic_thread_fence (memory_order);
#define atomic_thread_fence(MO)	__atomic_thread_fence (MO)
extern void atomic_signal_fence (memory_order);
#define atomic_signal_fence(MO)	__atomic_signal_fence  (MO)
#define atomic_is_lock_free(OBJ) __atomic_is_lock_free (sizeof (*(OBJ)), (OBJ))

#define ATOMIC_BOOL_LOCK_FREE		__GCC_ATOMIC_BOOL_LOCK_FREE
#define ATOMIC_CHAR_LOCK_FREE		__GCC_ATOMIC_CHAR_LOCK_FREE
#define ATOMIC_CHAR16_T_LOCK_FREE	__GCC_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE	__GCC_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE	__GCC_ATOMIC_WCHAR_T_LOCK_FREE
#define ATOMIC_SHORT_LOCK_FREE		__GCC_ATOMIC_SHORT_LOCK_FREE
#define ATOMIC_INT_LOCK_FREE		__GCC_ATOMIC_INT_LOCK_FREE
#define ATOMIC_LONG_LOCK_FREE		__GCC_ATOMIC_LONG_LOCK_FREE
#define ATOMIC_LLONG_LOCK_FREE		__GCC_ATOMIC_LLONG_LOCK_FREE
#define ATOMIC_POINTER_LOCK_FREE	__GCC_ATOMIC_POINTER_LOCK_FREE


/* Note that these macros require __typeof__ and __auto_type to remove
   _Atomic qualifiers (and const qualifiers, if those are valid on
   macro operands).
   
   Also note that the header file uses the generic form of __atomic
   builtins, which requires the address to be taken of the value
   parameter, and then we pass that value on.  This allows the macros
   to work for any type, and the compiler is smart enough to convert
   these to lock-free _N variants if possible, and throw away the
   temps.  */

#define atomic_store_explicit(PTR, VAL, MO)				\
  __extension__								\
  ({									\
    __auto_type __atomic_store_ptr = (PTR);				\
    __typeof__ (*__atomic_store_ptr) __atomic_store_tmp = (VAL);	\
    __atomic_store (__atomic_store_ptr, &__atomic_store_tmp, (MO));	\
  })

#define atomic_store(PTR, VAL)				\
  atomic_store_explicit (PTR, VAL, __ATOMIC_SEQ_CST)


#define atomic_load_explicit(PTR, MO)					\
  __extension__								\
  ({									\
    __auto_type __atomic_load_ptr = (PTR);				\
    __typeof__ (*__atomic_load_ptr) __atomic_load_tmp;			\
    __atomic_load (__atomic_load_ptr, &__atomic_load_tmp, (MO));	\
    __atomic_load_tmp;							\
  })

#define atomic_load(PTR)  atomic_load_explicit (PTR, __ATOMIC_SEQ_CST)


#define atomic_exchange_explicit(PTR, VAL, MO)				\
  __extension__								\
  ({									\
    __auto_type __atomic_exchange_ptr = (PTR);				\
    __typeof__ (*__atomic_exchange_ptr) __atomic_exchange_val = (VAL);	\
    __typeof__ (*__atomic_exchange_ptr) __atomic_exchange_tmp;		\
    __atomic_exchange (__atomic_exchange_ptr, &__atomic_exchange_val,	\
		       &__atomic_exchange_tmp, (MO));			\
    __atomic_exchange_tmp;						\
  })

#define atomic_exchange(PTR, VAL) 			\
  atomic_exchange_explicit (PTR, VAL, __ATOMIC_SEQ_CST)


#define atomic_compare_exchange_strong_explicit(PTR, VAL, DES, SUC, FAIL) \
  __extension__								\
  ({									\
    __auto_type __atomic_compare_exchange_ptr = (PTR);			\
    __typeof__ (*__atomic_compare_exchange_ptr) __atomic_compare_exchange_tmp \
      = (DES);								\
    __atomic_compare_exchange (__atomic_compare_exchange_ptr, (VAL),	\
			       &__atomic_compare_exchange_tmp, 0,	\
			       (SUC), (FAIL));				\
  })

#define atomic_compare_exchange_strong(PTR, VAL, DES) 			   \
  atomic_compare_exchange_strong_explicit (PTR, VAL, DES, __ATOMIC_SEQ_CST, \
					   __ATOMIC_SEQ_CST)

#define atomic_compare_exchange_weak_explicit(PTR, VAL, DES, SUC, FAIL) \
  __extension__								\
  ({									\
    __auto_type __atomic_compare_exchange_ptr = (PTR);			\
    __typeof__ (*__atomic_compare_exchange_ptr) __atomic_compare_exchange_tmp \
      = (DES);								\
    __atomic_compare_exchange (__atomic_compare_exchange_ptr, (VAL),	\
			       &__atomic_compare_exchange_tmp, 1,	\
			       (SUC), (FAIL));				\
  })

#define atomic_compare_exchange_weak(PTR, VAL, DES)			\
  atomic_compare_exchange_weak_explicit (PTR, VAL, DES, __ATOMIC_SEQ_CST, \
					 __ATOMIC_SEQ_CST)



#define atomic_fetch_add(PTR, VAL) __atomic_fetch_add ((PTR), (VAL), 	\
						       __ATOMIC_SEQ_CST)
#define atomic_fetch_add_explicit(PTR, VAL, MO) 			\
			  __atomic_fetch_add ((PTR), (VAL), (MO))

#define atomic_fetch_sub(PTR, VAL) __atomic_fetch_sub ((PTR), (VAL), 	\
						       __ATOMIC_SEQ_CST)
#define atomic_fetch_sub_explicit(PTR, VAL, MO) 			\
			  __atomic_fetch_sub ((PTR), (VAL), (MO))

#define atomic_fetch_or(PTR, VAL) __atomic_fetch_or ((PTR), (VAL), 	\
						       __ATOMIC_SEQ_CST)
#define atomic_fetch_or_explicit(PTR, VAL, MO) 			\
			  __atomic_fetch_or ((PTR), (VAL), (MO))

#define atomic_fetch_xor(PTR, VAL) __atomic_fetch_xor ((PTR), (VAL), 	\
						       __ATOMIC_SEQ_CST)
#define atomic_fetch_xor_explicit(PTR, VAL, MO) 			\
			  __atomic_fetch_xor ((PTR), (VAL), (MO))

#define atomic_fetch_and(PTR, VAL) __atomic_fetch_and ((PTR), (VAL), 	\
						       __ATOMIC_SEQ_CST)
#define atomic_fetch_and_explicit(PTR, VAL, MO) 			\
			  __atomic_fetch_and ((PTR), (VAL), (MO))


typedef _Atomic struct
{
#if __GCC_ATOMIC_TEST_AND_SET_TRUEVAL == 1
  _Bool __val;
#else
  unsigned char __val;
#endif
} atomic_flag;

#define ATOMIC_FLAG_INIT	{ 0 }


extern _Bool atomic_flag_test_and_set (volatile atomic_flag *);
#define atomic_flag_test_and_set(PTR) 					\
			__atomic_test_and_set ((PTR), __ATOMIC_SEQ_CST)
extern _Bool atomic_flag_test_and_set_explicit (volatile atomic_flag *,
						memory_order);
#define atomic_flag_test_and_set_explicit(PTR, MO)			\
			__atomic_test_and_set ((PTR), (MO))

extern void atomic_flag_clear (volatile atomic_flag *);
#define atomic_flag_clear(PTR)	__atomic_clear ((PTR), __ATOMIC_SEQ_CST)
extern void atomic_flag_clear_explicit (volatile atomic_flag *, memory_order);
#define atomic_flag_clear_explicit(PTR, MO)   __atomic_clear ((PTR), (MO))

#endif  /* _STDATOMIC_H */
#ifndef _GCC_WRAP_STDINT_H
#if __STDC_HOSTED__
# if defined __cplusplus && __cplusplus >= 201103L
#  undef __STDC_LIMIT_MACROS
#  define __STDC_LIMIT_MACROS
#  undef __STDC_CONSTANT_MACROS
#  define __STDC_CONSTANT_MACROS
# endif
# include_next <stdint.h>
#else
# include "stdint-gcc.h"
#endif
#define _GCC_WRAP_STDINT_H
#endif
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
# error "Never use <adxintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _ADXINTRIN_H_INCLUDED
#define _ADXINTRIN_H_INCLUDED

extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_subborrow_u32 (unsigned char __CF, unsigned int __X,
		unsigned int __Y, unsigned int *__P)
{
  return __builtin_ia32_sbb_u32 (__CF, __X, __Y, __P);
}

extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarry_u32 (unsigned char __CF, unsigned int __X,
	       unsigned int __Y, unsigned int *__P)
{
  return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P);
}

extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarryx_u32 (unsigned char __CF, unsigned int __X,
		unsigned int __Y, unsigned int *__P)
{
  return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P);
}

#ifdef __x86_64__
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_subborrow_u64 (unsigned char __CF, unsigned long long __X,
		unsigned long long __Y, unsigned long long *__P)
{
  return __builtin_ia32_sbb_u64 (__CF, __X, __Y, __P);
}

extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarry_u64 (unsigned char __CF, unsigned long long __X,
	       unsigned long long __Y, unsigned long long *__P)
{
  return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P);
}

extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarryx_u64 (unsigned char __CF, unsigned long long __X,
		unsigned long long __Y, unsigned long long *__P)
{
  return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P);
}
#endif

#endif /* _ADXINTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512FINTRIN_H_INCLUDED
#define _AVX512FINTRIN_H_INCLUDED

#ifndef __AVX512F__
#pragma GCC push_options
#pragma GCC target("avx512f")
#define __DISABLE_AVX512F__
#endif /* __AVX512F__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v8df __attribute__ ((__vector_size__ (64)));
typedef float __v16sf __attribute__ ((__vector_size__ (64)));
typedef long long __v8di __attribute__ ((__vector_size__ (64)));
typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64)));
typedef int __v16si __attribute__ ((__vector_size__ (64)));
typedef unsigned int __v16su __attribute__ ((__vector_size__ (64)));
typedef short __v32hi __attribute__ ((__vector_size__ (64)));
typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64)));
typedef char __v64qi __attribute__ ((__vector_size__ (64)));
typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));

typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_int2mask (int __M)
{
  return (__mmask16) __M;
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2int (__mmask16 __M)
{
  return (int) __M;
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set_epi64 (long long __A, long long __B, long long __C,
		  long long __D, long long __E, long long __F,
		  long long __G, long long __H)
{
  return __extension__ (__m512i) (__v8di)
	 { __H, __G, __F, __E, __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H I J K L M N O P].  */
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H,
		  int __I, int __J, int __K, int __L,
		  int __M, int __N, int __O, int __P)
{
  return __extension__ (__m512i)(__v16si)
	 { __P, __O, __N, __M, __L, __K, __J, __I,
	   __H, __G, __F, __E, __D, __C, __B, __A };
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set_pd (double __A, double __B, double __C, double __D,
	       double __E, double __F, double __G, double __H)
{
  return __extension__ (__m512d)
	 { __H, __G, __F, __E, __D, __C, __B, __A };
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H,
	       float __I, float __J, float __K, float __L,
	       float __M, float __N, float __O, float __P)
{
  return __extension__ (__m512)
	 { __P, __O, __N, __M, __L, __K, __J, __I,
	   __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)			      \
  _mm512_set_epi64(e7,e6,e5,e4,e3,e2,e1,e0)

#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,			      \
			  e8,e9,e10,e11,e12,e13,e14,e15)		      \
  _mm512_set_epi32(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)

#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)				      \
  _mm512_set_pd(e7,e6,e5,e4,e3,e2,e1,e0)

#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
  _mm512_set_ps(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_undefined_ps (void)
{
  __m512 __Y = __Y;
  return __Y;
}

#define _mm512_undefined _mm512_undefined_ps

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_undefined_pd (void)
{
  __m512d __Y = __Y;
  return __Y;
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_undefined_epi32 (void)
{
  __m512i __Y = __Y;
  return __Y;
}

#define _mm512_undefined_si512 _mm512_undefined_epi32

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_epi8 (char __A)
{
  return __extension__ (__m512i)(__v64qi)
	 { __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A };
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_epi16 (short __A)
{
  return __extension__ (__m512i)(__v32hi)
	 { __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A,
	   __A, __A, __A, __A, __A, __A, __A, __A };
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_pd (double __A)
{
  return (__m512d) __builtin_ia32_broadcastsd512 (__extension__
						  (__v2df) { __A, },
						  (__v8df)
						  _mm512_undefined_pd (),
						  (__mmask8) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_ps (float __A)
{
  return (__m512) __builtin_ia32_broadcastss512 (__extension__
						 (__v4sf) { __A, },
						 (__v16sf)
						 _mm512_undefined_ps (),
						 (__mmask16) -1);
}

/* Create the vector [A B C D A B C D A B C D A B C D].  */
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
{
  return __extension__ (__m512i)(__v16si)
	 { __D, __C, __B, __A, __D, __C, __B, __A,
	   __D, __C, __B, __A, __D, __C, __B, __A };
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set4_epi64 (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m512i) (__v8di)
	 { __D, __C, __B, __A, __D, __C, __B, __A };
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set4_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m512d)
	 { __D, __C, __B, __A, __D, __C, __B, __A };
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set4_ps (float __A, float __B, float __C, float __D)
{
  return __extension__ (__m512)
	 { __D, __C, __B, __A, __D, __C, __B, __A,
	   __D, __C, __B, __A, __D, __C, __B, __A };
}

#define _mm512_setr4_epi64(e0,e1,e2,e3)					      \
  _mm512_set4_epi64(e3,e2,e1,e0)

#define _mm512_setr4_epi32(e0,e1,e2,e3)					      \
  _mm512_set4_epi32(e3,e2,e1,e0)

#define _mm512_setr4_pd(e0,e1,e2,e3)					      \
  _mm512_set4_pd(e3,e2,e1,e0)

#define _mm512_setr4_ps(e0,e1,e2,e3)					      \
  _mm512_set4_ps(e3,e2,e1,e0)

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_setzero_ps (void)
{
  return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_setzero_pd (void)
{
  return __extension__ (__m512d) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_setzero_epi32 (void)
{
  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_setzero_si512 (void)
{
  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
						  (__v8df) __W,
						  (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
						  (__v8df)
						  _mm512_setzero_pd (),
						  (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
						 (__v16sf) __W,
						 (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
						 (__v16sf)
						 _mm512_setzero_ps (),
						 (__mmask16) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_load_pd (void const *__P)
{
  return *(__m512d *) __P;
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
						   (__v8df) __W,
						   (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_load_pd (__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
						   (__v8df)
						   _mm512_setzero_pd (),
						   (__mmask8) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_store_pd (void *__P, __m512d __A)
{
  *(__m512d *) __P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_store_pd (void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeapd512_mask ((__v8df *) __P, (__v8df) __A,
				   (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_load_ps (void const *__P)
{
  return *(__m512 *) __P;
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
						  (__v16sf) __W,
						  (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_load_ps (__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
						  (__v16sf)
						  _mm512_setzero_ps (),
						  (__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_store_ps (void *__P, __m512 __A)
{
  *(__m512 *) __P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_store_ps (void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeaps512_mask ((__v16sf *) __P, (__v16sf) __A,
				   (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
						     (__v8di) __W,
						     (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
						     (__v8di)
						     _mm512_setzero_si512 (),
						     (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_load_epi64 (void const *__P)
{
  return *(__m512i *) __P;
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
							(__v8di) __W,
							(__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
							(__v8di)
							_mm512_setzero_si512 (),
							(__mmask8) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_store_epi64 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
					(__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
						     (__v16si) __W,
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_load_si512 (void const *__P)
{
  return *(__m512i *) __P;
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_load_epi32 (void const *__P)
{
  return *(__m512i *) __P;
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
							(__v16si) __W,
							(__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
							(__v16si)
							_mm512_setzero_si512 (),
							(__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_store_si512 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_store_epi32 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
					(__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A * (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srav_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A + (__v8du) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A - (__v8du) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di)
						 _mm512_undefined_pd (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srav_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
						 (__v8di) __Y,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A + (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v8di) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
						  (__v16si) __Y,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A - (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_epu32 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
						   (__v16si) __Y,
						   (__v8di)
						   _mm512_undefined_epi32 (),
						   (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
						   (__v16si) __Y,
						   (__v8di) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
						   (__v16si) __Y,
						   (__v8di)
						   _mm512_setzero_si512 (),
						   __M);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_slli_epi64 (__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_slli_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
			unsigned int __B)
{
  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
						  (__v8di) __W,
						  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  (__mmask8) __U);
}
#else
#define _mm512_slli_epi64(X, C)						   \
  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask8)-1))

#define _mm512_mask_slli_epi64(W, U, X, C)				   \
  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_slli_epi64(U, X, C)                                   \
  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)_mm512_setzero_si512 (),\
    (__mmask8)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sll_epi64 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srli_epi64 (__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U,
			__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
						  (__v8di) __W,
						  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  (__mmask8) __U);
}
#else
#define _mm512_srli_epi64(X, C)						   \
  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask8)-1))

#define _mm512_mask_srli_epi64(W, U, X, C)				   \
  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_srli_epi64(U, X, C)                                   \
  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)_mm512_setzero_si512 (),\
    (__mmask8)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srl_epi64 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srai_epi64 (__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
			unsigned int __B)
{
  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
						  (__v8di) __W,
						  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  (__mmask8) __U);
}
#else
#define _mm512_srai_epi64(X, C)						   \
  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask8)-1))

#define _mm512_mask_srai_epi64(W, U, X, C)				   \
  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_srai_epi64(U, X, C)				   \
  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
    (__v8di)(__m512i)_mm512_setzero_si512 (),\
    (__mmask8)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sra_epi64 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
						 (__v2di) __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_slli_epi32 (__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
			unsigned int __B)
{
  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}
#else
#define _mm512_slli_epi32(X, C)						    \
  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask16)-1))

#define _mm512_mask_slli_epi32(W, U, X, C)                                  \
  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)(W),\
    (__mmask16)(U)))

#define _mm512_maskz_slli_epi32(U, X, C)                                    \
  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_setzero_si512 (),\
    (__mmask16)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sll_epi32 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si)
						 _mm512_undefined_epi32 (),
						 (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srli_epi32 (__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U,
			__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}
#else
#define _mm512_srli_epi32(X, C)						    \
  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask16)-1))

#define _mm512_mask_srli_epi32(W, U, X, C)                                  \
  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)(W),\
    (__mmask16)(U)))

#define _mm512_maskz_srli_epi32(U, X, C)				    \
  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_setzero_si512 (),\
    (__mmask16)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srl_epi32 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si)
						 _mm512_undefined_epi32 (),
						 (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srai_epi32 (__m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
			unsigned int __B)
{
  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
{
  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}
#else
#define _mm512_srai_epi32(X, C)						    \
  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask16)-1))

#define _mm512_mask_srai_epi32(W, U, X, C)				    \
  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)(W),\
    (__mmask16)(U)))

#define _mm512_maskz_srai_epi32(U, X, C)				    \
  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_setzero_si512 (),\
    (__mmask16)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sra_epi32 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si)
						 _mm512_undefined_epi32 (),
						 (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
						 (__v4si) __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
					       (__v2df) __B,
					       __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
					      (__v4sf) __B,
					      __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			   const int __R)
{
  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
					       (__v2df) __B,
					       __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
					      (__v4sf) __B,
					      __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			   const int __R)
{
  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U, __R);
}

#else
#define _mm_add_round_sd(A, B, C)            \
    (__m128d)__builtin_ia32_addsd_round(A, B, C)

#define _mm_mask_add_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C)

#define _mm_maskz_add_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)

#define _mm_add_round_ss(A, B, C)            \
    (__m128)__builtin_ia32_addss_round(A, B, C)

#define _mm_mask_add_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C)

#define _mm_maskz_add_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)

#define _mm_sub_round_sd(A, B, C)            \
    (__m128d)__builtin_ia32_subsd_round(A, B, C)

#define _mm_mask_sub_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C)

#define _mm_maskz_sub_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)

#define _mm_sub_round_ss(A, B, C)            \
    (__m128)__builtin_ia32_subss_round(A, B, C)

#define _mm_mask_sub_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C)

#define _mm_maskz_sub_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)

#endif

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C,
			   const int __imm)
{
  return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A,
						     (__v8di) __B,
						     (__v8di) __C, __imm,
						     (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B,
				__m512i __C, const int __imm)
{
  return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A,
						     (__v8di) __B,
						     (__v8di) __C, __imm,
						     (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
				 __m512i __C, const int __imm)
{
  return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A,
						      (__v8di) __B,
						      (__v8di) __C,
						      __imm, (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C,
			   const int __imm)
{
  return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si) __C,
						     __imm, (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
				__m512i __C, const int __imm)
{
  return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si) __C,
						     __imm, (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
				 __m512i __C, const int __imm)
{
  return (__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si) __A,
						      (__v16si) __B,
						      (__v16si) __C,
						      __imm, (__mmask16) __U);
}
#else
#define _mm512_ternarylogic_epi64(A, B, C, I)				\
  ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A),	\
    (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)-1))
#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I)			\
  ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A),	\
    (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U)))
#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I)			\
  ((__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di)(__m512i)(A),	\
    (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U)))
#define _mm512_ternarylogic_epi32(A, B, C, I)				\
  ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A),	\
    (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I),		\
    (__mmask16)-1))
#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I)			\
  ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A),	\
    (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I),		\
    (__mmask16)(U)))
#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I)			\
  ((__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si)(__m512i)(A),	\
    (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I),		\
    (__mmask16)(U)))
#endif

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp14_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
						   (__v8df)
						   _mm512_undefined_pd (),
						   (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
						   (__v8df) __W,
						   (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
						   (__v8df)
						   _mm512_setzero_pd (),
						   (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp14_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
						  (__v16sf)
						  _mm512_undefined_ps (),
						  (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
						  (__v16sf) __W,
						  (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
						  (__v16sf)
						  _mm512_setzero_ps (),
						  (__mmask16) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp14_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B,
					   (__v2df) __A);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
						(__v2df) __A,
						(__v2df) __W,
						(__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
						(__v2df) __A,
						(__v2df) _mm_setzero_ps (),
						(__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp14_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B,
					  (__v4sf) __A);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
						(__v4sf) __A,
						(__v4sf) __W,
						(__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
						(__v4sf) __A,
						(__v4sf) _mm_setzero_ps (),
						(__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt14_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
						     (__v8df)
						     _mm512_undefined_pd (),
						     (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
						     (__v8df) __W,
						     (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
						     (__v8df)
						     _mm512_setzero_pd (),
						     (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt14_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
						    (__v16sf)
						    _mm512_undefined_ps (),
						    (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
						    (__v16sf) __W,
						    (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt14_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B,
					     (__v2df) __A);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
						 (__v2df) __A,
						 (__v2df) __W,
						 (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
						 (__v2df) __A,
						 (__v2df) _mm_setzero_pd (),
						 (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt14_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B,
					    (__v4sf) __A);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
						 (__v4sf) __A,
						 (__v4sf) __W,
						 (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
						(__v4sf) __A,
						(__v4sf) _mm_setzero_ps (),
						(__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sqrt_round_pd (__m512d __A, const int __R)
{
  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
						  (__v8df)
						  _mm512_undefined_pd (),
						  (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			   const int __R)
{
  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
						  (__v8df) __W,
						  (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R)
{
  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
						  (__v8df)
						  _mm512_setzero_pd (),
						  (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sqrt_round_ps (__m512 __A, const int __R)
{
  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
						 (__v16sf)
						 _mm512_undefined_ps (),
						 (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R)
{
  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
						 (__v16sf) __W,
						 (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R)
{
  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
						 (__v16sf)
						 _mm512_setzero_ps (),
						 (__mmask16) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
						     (__v2df) __A,
						     (__v2df)
						     _mm_setzero_pd (),
						     (__mmask8) -1, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
			const int __R)
{
  return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
						     (__v2df) __A,
						     (__v2df) __W,
						     (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
						     (__v2df) __A,
						     (__v2df)
						     _mm_setzero_pd (),
						     (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
						    (__v4sf) __A,
						    (__v4sf)
						    _mm_setzero_ps (),
						    (__mmask8) -1, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
			const int __R)
{
  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
						    (__v4sf) __A,
						    (__v4sf) __W,
						    (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
						    (__v4sf) __A,
						    (__v4sf)
						    _mm_setzero_ps (),
						    (__mmask8) __U, __R);
}
#else
#define _mm512_sqrt_round_pd(A, C)            \
    (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C)

#define _mm512_mask_sqrt_round_pd(W, U, A, C) \
    (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C)

#define _mm512_maskz_sqrt_round_pd(U, A, C)   \
    (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_sqrt_round_ps(A, C)            \
    (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C)

#define _mm512_mask_sqrt_round_ps(W, U, A, C) \
    (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C)

#define _mm512_maskz_sqrt_round_ps(U, A, C)   \
    (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm_sqrt_round_sd(A, B, C)	      \
    (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
	(__v2df) _mm_setzero_pd (), -1, C)

#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C)

#define _mm_maskz_sqrt_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
	(__v2df) _mm_setzero_pd (), U, C)

#define _mm_sqrt_round_ss(A, B, C)	      \
    (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
	(__v4sf) _mm_setzero_ps (), -1, C)

#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C)

#define _mm_maskz_sqrt_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
	(__v4sf) _mm_setzero_ps (), U, C)
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi8_epi32 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
						    (__v16si)
						    _mm512_undefined_epi32 (),
						    (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
						    (__v16si) __W,
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
						    (__v16si)
						    _mm512_setzero_si512 (),
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi8_epi64 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
						    (__v8di)
						    _mm512_undefined_epi32 (),
						    (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
						    (__v8di)
						    _mm512_setzero_si512 (),
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi16_epi32 (__m256i __A)
{
  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
						    (__v16si)
						    _mm512_undefined_epi32 (),
						    (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
						    (__v16si) __W,
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
						    (__v16si)
						    _mm512_setzero_si512 (),
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi16_epi64 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
						    (__v8di)
						    _mm512_undefined_epi32 (),
						    (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
						    (__v8di)
						    _mm512_setzero_si512 (),
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi32_epi64 (__m256i __X)
{
  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
						    (__v8di)
						    _mm512_undefined_epi32 (),
						    (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
{
  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
						    (__v8di)
						    _mm512_setzero_si512 (),
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu8_epi32 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
						    (__v16si)
						    _mm512_undefined_epi32 (),
						    (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
						    (__v16si) __W,
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
						    (__v16si)
						    _mm512_setzero_si512 (),
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu8_epi64 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
						    (__v8di)
						    _mm512_undefined_epi32 (),
						    (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
						    (__v8di)
						    _mm512_setzero_si512 (),
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu16_epi32 (__m256i __A)
{
  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
						    (__v16si)
						    _mm512_undefined_epi32 (),
						    (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
						    (__v16si) __W,
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
						    (__v16si)
						    _mm512_setzero_si512 (),
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu16_epi64 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
						    (__v8di)
						    _mm512_undefined_epi32 (),
						    (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
{
  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
						    (__v8di)
						    _mm512_setzero_si512 (),
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu32_epi64 (__m256i __X)
{
  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
						    (__v8di)
						    _mm512_undefined_epi32 (),
						    (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
{
  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
						    (__v8di)
						    _mm512_setzero_si512 (),
						    (__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			  __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			   const int __R)
{
  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			  __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			  __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			   const int __R)
{
  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			  __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U, __R);
}
#else
#define _mm512_add_round_pd(A, B, C)            \
    (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)

#define _mm512_mask_add_round_pd(W, U, A, B, C) \
    (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C)

#define _mm512_maskz_add_round_pd(U, A, B, C)   \
    (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_add_round_ps(A, B, C)            \
    (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)

#define _mm512_mask_add_round_ps(W, U, A, B, C) \
    (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C)

#define _mm512_maskz_add_round_ps(U, A, B, C)   \
    (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm512_sub_round_pd(A, B, C)            \
    (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)

#define _mm512_mask_sub_round_pd(W, U, A, B, C) \
    (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C)

#define _mm512_maskz_sub_round_pd(U, A, B, C)   \
    (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_sub_round_ps(A, B, C)            \
    (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)

#define _mm512_mask_sub_round_ps(W, U, A, B, C) \
    (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C)

#define _mm512_maskz_sub_round_ps(U, A, B, C)   \
    (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
#endif

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			  __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			   const int __R)
{
  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			  __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R)
{
  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
						 (__v8df) __V,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M,
			  __m512d __V, const int __R)
{
  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
						 (__v8df) __V,
						 (__v8df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V,
			   const int __R)
{
  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
						 (__v8df) __V,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			  __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
					       (__v2df) __B,
					       __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
					      (__v4sf) __B,
					      __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			   const int __R)
{
  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
					       (__v2df) __B,
					       __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
					      (__v4sf) __B,
					      __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			   const int __R)
{
  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U, __R);
}

#else
#define _mm512_mul_round_pd(A, B, C)            \
    (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)

#define _mm512_mask_mul_round_pd(W, U, A, B, C) \
    (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C)

#define _mm512_maskz_mul_round_pd(U, A, B, C)   \
    (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_mul_round_ps(A, B, C)            \
    (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)

#define _mm512_mask_mul_round_ps(W, U, A, B, C) \
    (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C)

#define _mm512_maskz_mul_round_ps(U, A, B, C)   \
    (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm512_div_round_pd(A, B, C)            \
    (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)

#define _mm512_mask_div_round_pd(W, U, A, B, C) \
    (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C)

#define _mm512_maskz_div_round_pd(U, A, B, C)   \
    (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_div_round_ps(A, B, C)            \
    (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)

#define _mm512_mask_div_round_ps(W, U, A, B, C) \
    (__m512)__builtin_ia32_divps512_mask(A, B, W, U, C)

#define _mm512_maskz_div_round_ps(U, A, B, C)   \
    (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm_mul_round_sd(A, B, C)            \
    (__m128d)__builtin_ia32_mulsd_round(A, B, C)

#define _mm_mask_mul_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C)

#define _mm_maskz_mul_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)

#define _mm_mul_round_ss(A, B, C)            \
    (__m128)__builtin_ia32_mulss_round(A, B, C)

#define _mm_mask_mul_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C)

#define _mm_maskz_mul_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)

#define _mm_div_round_sd(A, B, C)            \
    (__m128d)__builtin_ia32_divsd_round(A, B, C)

#define _mm_mask_div_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C)

#define _mm_maskz_div_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)

#define _mm_div_round_ss(A, B, C)            \
    (__m128)__builtin_ia32_divss_round(A, B, C)

#define _mm_mask_div_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C)

#define _mm_maskz_div_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)

#endif

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			  __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			   const int __R)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			  __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			  __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			   const int __R)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			  __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U, __R);
}
#else
#define _mm512_max_round_pd(A, B,  R) \
    (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)

#define _mm512_mask_max_round_pd(W, U,  A, B, R) \
    (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R)

#define _mm512_maskz_max_round_pd(U, A,  B, R) \
    (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)

#define _mm512_max_round_ps(A, B,  R) \
    (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_pd(), -1, R)

#define _mm512_mask_max_round_ps(W, U,  A, B, R) \
    (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R)

#define _mm512_maskz_max_round_ps(U, A,  B, R) \
    (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)

#define _mm512_min_round_pd(A, B,  R) \
    (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)

#define _mm512_mask_min_round_pd(W, U,  A, B, R) \
    (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R)

#define _mm512_maskz_min_round_pd(U, A,  B, R) \
    (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)

#define _mm512_min_round_ps(A, B, R) \
    (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R)

#define _mm512_mask_min_round_ps(W, U,  A, B, R) \
    (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R)

#define _mm512_maskz_min_round_ps(U, A,  B, R) \
    (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
#endif

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			     __m512d __B, const int __R)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __W,
						    (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			      const int __R)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			     __m512 __B, const int __R)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __W,
						   (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
			      const int __R)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
						       (__v2df) __B,
						       (__v2df)
						       _mm_setzero_pd (),
						       (__mmask8) -1, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
			  const int __R)
{
  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
						       (__v2df) __B,
						       (__v2df) __W,
						       (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
						       (__v2df) __B,
						       (__v2df)
						       _mm_setzero_pd (),
						       (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
						      (__v4sf) __B,
						      (__v4sf)
						      _mm_setzero_ps (),
						      (__mmask8) -1, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
			 const int __R)
{
  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
						      (__v4sf) __B,
						      (__v4sf) __W,
						      (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
						      (__v4sf) __B,
						      (__v4sf)
						      _mm_setzero_ps (),
						      (__mmask8) __U, __R);
}
#else
#define _mm512_scalef_round_pd(A, B, C)            \
    (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)

#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
    (__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C)

#define _mm512_maskz_scalef_round_pd(U, A, B, C)   \
    (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)

#define _mm512_scalef_round_ps(A, B, C)            \
    (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)

#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
    (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C)

#define _mm512_maskz_scalef_round_ps(U, A, B, C)   \
    (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)

#define _mm_scalef_round_sd(A, B, C)            \
    (__m128d)__builtin_ia32_scalefsd_mask_round (A, B, \
	(__v2df)_mm_setzero_pd (), -1, C)

#define _mm_scalef_round_ss(A, B, C)            \
    (__m128)__builtin_ia32_scalefss_mask_round (A, B, \
	(__v4sf)_mm_setzero_ps (), -1, C)
#endif

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __C,
						    (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
			    __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __C,
						    (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
			     __mmask8 __U, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			     __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __C,
						   (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
			    __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __C,
						   (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
			     __mmask16 __U, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
			     __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    -(__v8df) __C,
						    (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
			    __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    -(__v8df) __C,
						    (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
			     __mmask8 __U, const int __R)
{
  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			     __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
						     (__v8df) __B,
						     -(__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   -(__v16sf) __C,
						   (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
			    __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   -(__v16sf) __C,
						   (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
			     __mmask16 __U, const int __R)
{
  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
			     __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
						    (__v16sf) __B,
						    -(__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       (__v8df) __C,
						       (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
			       __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       (__v8df) __C,
						       (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
				__mmask8 __U, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
							(__v8df) __B,
							(__v8df) __C,
							(__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
				__m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
							(__v8df) __B,
							(__v8df) __C,
							(__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      (__v16sf) __C,
						      (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
			       __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      (__v16sf) __C,
						      (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
				__mmask16 __U, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
						       (__v16sf) __B,
						       (__v16sf) __C,
						       (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
				__m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
						       (__v16sf) __B,
						       (__v16sf) __C,
						       (__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       -(__v8df) __C,
						       (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
			       __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       -(__v8df) __C,
						       (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
				__mmask8 __U, const int __R)
{
  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
							(__v8df) __B,
							(__v8df) __C,
							(__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
				__m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
							(__v8df) __B,
							-(__v8df) __C,
							(__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      -(__v16sf) __C,
						      (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
			       __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      -(__v16sf) __C,
						      (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
				__mmask16 __U, const int __R)
{
  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
						       (__v16sf) __B,
						       (__v16sf) __C,
						       (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
				__m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
						       (__v16sf) __B,
						       -(__v16sf) __C,
						       (__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __C,
						    (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
			     __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
			      __mmask8 __U, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			      __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __C,
						   (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
			     __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
			      __mmask16 __U, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
			      __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
						    (__v8df) __B,
						    -(__v8df) __C,
						    (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
			     __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
			      __mmask8 __U, const int __R)
{
  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
						      (__v8df) __B,
						      (__v8df) __C,
						      (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
			      __m512d __C, const int __R)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
						     (__v8df) __B,
						     -(__v8df) __C,
						     (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
						   (__v16sf) __B,
						   -(__v16sf) __C,
						   (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
			     __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
			      __mmask16 __U, const int __R)
{
  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
						     (__v16sf) __B,
						     (__v16sf) __C,
						     (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
			      __m512 __C, const int __R)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
						    (__v16sf) __B,
						    -(__v16sf) __C,
						    (__mmask16) __U, __R);
}
#else
#define _mm512_fmadd_round_pd(A, B, C, R)            \
    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R)

#define _mm512_mask_fmadd_round_pd(A, U, B, C, R)    \
    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R)

#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R)   \
    (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R)   \
    (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R)

#define _mm512_fmadd_round_ps(A, B, C, R)            \
    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R)

#define _mm512_mask_fmadd_round_ps(A, U, B, C, R)    \
    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R)

#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R)   \
    (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R)   \
    (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R)

#define _mm512_fmsub_round_pd(A, B, C, R)            \
    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), -1, R)

#define _mm512_mask_fmsub_round_pd(A, U, B, C, R)    \
    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), U, R)

#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R)   \
    (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R)   \
    (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, -(C), U, R)

#define _mm512_fmsub_round_ps(A, B, C, R)            \
    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), -1, R)

#define _mm512_mask_fmsub_round_ps(A, U, B, C, R)    \
    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), U, R)

#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R)   \
    (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R)   \
    (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, -(C), U, R)

#define _mm512_fmaddsub_round_pd(A, B, C, R)            \
    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R)

#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R)    \
    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R)

#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R)   \
    (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R)   \
    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R)

#define _mm512_fmaddsub_round_ps(A, B, C, R)            \
    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R)

#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R)    \
    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R)

#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R)   \
    (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R)   \
    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R)

#define _mm512_fmsubadd_round_pd(A, B, C, R)            \
    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R)

#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R)    \
    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R)

#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R)   \
    (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R)   \
    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R)

#define _mm512_fmsubadd_round_ps(A, B, C, R)            \
    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R)

#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R)    \
    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R)

#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R)   \
    (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R)

#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R)   \
    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R)

#define _mm512_fnmadd_round_pd(A, B, C, R)            \
    (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, C, -1, R)

#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R)    \
    (__m512d)__builtin_ia32_vfnmaddpd512_mask(-(A), B, C, U, R)

#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R)   \
    (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(A), B, C, U, R)

#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R)   \
    (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, C, U, R)

#define _mm512_fnmadd_round_ps(A, B, C, R)            \
    (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, C, -1, R)

#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R)    \
    (__m512)__builtin_ia32_vfnmaddps512_mask(-(A), B, C, U, R)

#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R)   \
    (__m512)__builtin_ia32_vfmaddps512_mask3(-(A), B, C, U, R)

#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R)   \
    (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, C, U, R)

#define _mm512_fnmsub_round_pd(A, B, C, R)            \
    (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, -(C), -1, R)

#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R)    \
    (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R)

#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R)   \
    (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R)

#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R)   \
    (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, -(C), U, R)

#define _mm512_fnmsub_round_ps(A, B, C, R)            \
    (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, -(C), -1, R)

#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R)    \
    (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R)

#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R)   \
    (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R)

#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R)   \
    (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, -(C), U, R)
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_abs_epi64 (__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_abs_epi32 (__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
						 (__v16si)
						 _mm512_undefined_epi32 (),
						 (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastss_ps (__m128 __A)
{
  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
						 (__v16sf)
						 _mm512_undefined_ps (),
						 (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
						 (__v16sf) __O, __M);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
{
  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
						 (__v16sf)
						 _mm512_setzero_ps (),
						 __M);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastsd_pd (__m128d __A)
{
  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
						  (__v8df)
						  _mm512_undefined_pd (),
						  (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
{
  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
						  (__v8df) __O, __M);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
{
  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
						  (__v8df)
						  _mm512_setzero_pd (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastd_epi32 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
						  (__v16si) __O, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_epi32 (int __A)
{
  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
							   (__v16si)
							   _mm512_undefined_epi32 (),
							   (__mmask16)(-1));
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
{
  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
							   __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_set1_epi32 (__mmask16 __M, int __A)
{
  return (__m512i)
	 __builtin_ia32_pbroadcastd512_gpr_mask (__A,
						 (__v16si) _mm512_setzero_si512 (),
						 __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastq_epi64 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
						  (__v8di) __O, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_epi64 (long long __A)
{
  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
							   (__v8di)
							   _mm512_undefined_epi32 (),
							   (__mmask8)(-1));
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
							   __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
{
  return (__m512i)
	 __builtin_ia32_pbroadcastq512_gpr_mask (__A,
						 (__v8di) _mm512_setzero_si512 (),
						 __M);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcast_f32x4 (__m128 __A)
{
  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
						     (__v16sf)
						     _mm512_undefined_ps (),
						     (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
						     (__v16sf) __O,
						     __M);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
{
  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
						     (__v16sf)
						     _mm512_setzero_ps (),
						     __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcast_i32x4 (__m128i __A)
{
  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
						      (__v16si)
						      _mm512_undefined_epi32 (),
						      (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
						      (__v16si) __O,
						      __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
						      (__v16si)
						      _mm512_setzero_si512 (),
						      __M);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcast_f64x4 (__m256d __A)
{
  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
						      (__v8df)
						      _mm512_undefined_pd (),
						      (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
{
  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
						      (__v8df) __O,
						      __M);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
{
  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
						      (__v8df)
						      _mm512_setzero_pd (),
						      __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcast_i64x4 (__m256i __A)
{
  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
						      (__v8di)
						      _mm512_undefined_epi32 (),
						      (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
{
  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
						      (__v8di) __O,
						      __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
{
  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
						      (__v8di)
						      _mm512_setzero_si512 (),
						      __M);
}

typedef enum
{
  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
  _MM_PERM_DDDD = 0xFF
} _MM_PERM_ENUM;

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask)
{
  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
						  __mask,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
			   _MM_PERM_ENUM __mask)
{
  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
						  __mask,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask)
{
  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
						  __mask,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
						   (__v8di) __B, __imm,
						   (__v8di)
						   _mm512_undefined_epi32 (),
						   (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A,
			   __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
						   (__v8di) __B, __imm,
						   (__v8di) __W,
						   (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B,
			    const int __imm)
{
  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
						   (__v8di) __B, __imm,
						   (__v8di)
						   _mm512_setzero_si512 (),
						   (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
						   (__v16si) __B,
						   __imm,
						   (__v16si)
						   _mm512_undefined_epi32 (),
						   (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A,
			   __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
						   (__v16si) __B,
						   __imm,
						   (__v16si) __W,
						   (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B,
			    const int __imm)
{
  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
						   (__v16si) __B,
						   __imm,
						   (__v16si)
						   _mm512_setzero_si512 (),
						   (__mmask16) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm)
{
  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
						   (__v8df) __B, __imm,
						   (__v8df)
						   _mm512_undefined_pd (),
						   (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A,
			   __m512d __B, const int __imm)
{
  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
						   (__v8df) __B, __imm,
						   (__v8df) __W,
						   (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B,
			    const int __imm)
{
  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
						   (__v8df) __B, __imm,
						   (__v8df)
						   _mm512_setzero_pd (),
						   (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm)
{
  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
						  (__v16sf) __B, __imm,
						  (__v16sf)
						  _mm512_undefined_ps (),
						  (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A,
			   __m512 __B, const int __imm)
{
  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
						  (__v16sf) __B, __imm,
						  (__v16sf) __W,
						  (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B,
			    const int __imm)
{
  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
						  (__v16sf) __B, __imm,
						  (__v16sf)
						  _mm512_setzero_ps (),
						  (__mmask16) __U);
}

#else
#define _mm512_shuffle_epi32(X, C)                                      \
  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask16)-1))

#define _mm512_mask_shuffle_epi32(W, U, X, C)                           \
  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)(W),\
    (__mmask16)(U)))

#define _mm512_maskz_shuffle_epi32(U, X, C)                             \
  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
    (__v16si)(__m512i)_mm512_setzero_si512 (),\
    (__mmask16)(U)))

#define _mm512_shuffle_i64x2(X, Y, C)                                   \
  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
      (__v8di)(__m512i)(Y), (int)(C),\
    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask8)-1))

#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C)                        \
  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
      (__v8di)(__m512i)(Y), (int)(C),\
    (__v8di)(__m512i)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_shuffle_i64x2(U, X, Y, C)                          \
  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
      (__v8di)(__m512i)(Y), (int)(C),\
    (__v8di)(__m512i)_mm512_setzero_si512 (),\
    (__mmask8)(U)))

#define _mm512_shuffle_i32x4(X, Y, C)                                   \
  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
      (__v16si)(__m512i)(Y), (int)(C),\
    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
    (__mmask16)-1))

#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C)                        \
  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
      (__v16si)(__m512i)(Y), (int)(C),\
    (__v16si)(__m512i)(W),\
    (__mmask16)(U)))

#define _mm512_maskz_shuffle_i32x4(U, X, Y, C)                          \
  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
      (__v16si)(__m512i)(Y), (int)(C),\
    (__v16si)(__m512i)_mm512_setzero_si512 (),\
    (__mmask16)(U)))

#define _mm512_shuffle_f64x2(X, Y, C)                                   \
  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),     \
      (__v8df)(__m512d)(Y), (int)(C),\
    (__v8df)(__m512d)_mm512_undefined_pd(),\
    (__mmask8)-1))

#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C)                        \
  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),     \
      (__v8df)(__m512d)(Y), (int)(C),\
    (__v8df)(__m512d)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_shuffle_f64x2(U, X, Y, C)                         \
  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),    \
      (__v8df)(__m512d)(Y), (int)(C),\
    (__v8df)(__m512d)_mm512_setzero_pd(),\
    (__mmask8)(U)))

#define _mm512_shuffle_f32x4(X, Y, C)                                  \
  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
      (__v16sf)(__m512)(Y), (int)(C),\
    (__v16sf)(__m512)_mm512_undefined_ps(),\
    (__mmask16)-1))

#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C)                       \
  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
      (__v16sf)(__m512)(Y), (int)(C),\
    (__v16sf)(__m512)(W),\
    (__mmask16)(U)))

#define _mm512_maskz_shuffle_f32x4(U, X, Y, C)                         \
  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
      (__v16sf)(__m512)(Y), (int)(C),\
    (__v16sf)(__m512)_mm512_setzero_ps(),\
    (__mmask16)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rolv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rorv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rolv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di) __W,
						  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rorv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di) __W,
						  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  (__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_undefined_si256 (),
						     (__mmask8) -1, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
				const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
						     (__v8si) __W,
						     (__mmask8) __U, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
						      (__v8si)
						      _mm256_undefined_si256 (),
						      (__mmask8) -1, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
				const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
						      (__v8si) __W,
						      (__mmask8) __U, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
						      (__v8si)
						      _mm256_setzero_si256 (),
						      (__mmask8) __U, __R);
}
#else
#define _mm512_cvtt_roundpd_epi32(A, B)		     \
    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))

#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B)   \
    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B))

#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B)     \
    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))

#define _mm512_cvtt_roundpd_epu32(A, B)		     \
    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))

#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B)   \
    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B))

#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B)     \
    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
						    (__v8si)
						    _mm256_undefined_si256 (),
						    (__mmask8) -1, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
			       const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
						    (__v8si) __W,
						    (__mmask8) __U, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
						    (__v8si)
						    _mm256_setzero_si256 (),
						    (__mmask8) __U, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_undefined_si256 (),
						     (__mmask8) -1, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
			       const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
						     (__v8si) __W,
						     (__mmask8) __U, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U, __R);
}
#else
#define _mm512_cvt_roundpd_epi32(A, B)		    \
    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))

#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B)   \
    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B))

#define _mm512_maskz_cvt_roundpd_epi32(U, A, B)     \
    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))

#define _mm512_cvt_roundpd_epu32(A, B)		    \
    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))

#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B)   \
    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B))

#define _mm512_maskz_cvt_roundpd_epu32(U, A, B)     \
    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
#endif

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_undefined_epi32 (),
						     (__mmask16) -1, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
				const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
						     (__v16si) __W,
						     (__mmask16) __U, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
						      (__v16si)
						      _mm512_undefined_epi32 (),
						      (__mmask16) -1, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
				const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
						      (__v16si) __W,
						      (__mmask16) __U, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
						      (__v16si)
						      _mm512_setzero_si512 (),
						      (__mmask16) __U, __R);
}
#else
#define _mm512_cvtt_roundps_epi32(A, B)		     \
    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))

#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B)   \
    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B))

#define _mm512_maskz_cvtt_roundps_epi32(U, A, B)     \
    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))

#define _mm512_cvtt_roundps_epu32(A, B)		     \
    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))

#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B)   \
    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B))

#define _mm512_maskz_cvtt_roundps_epu32(U, A, B)     \
    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
#endif

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundps_epi32 (__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
						    (__v16si)
						    _mm512_undefined_epi32 (),
						    (__mmask16) -1, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
			       const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
						    (__v16si) __W,
						    (__mmask16) __U, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
						    (__v16si)
						    _mm512_setzero_si512 (),
						    (__mmask16) __U, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundps_epu32 (__m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_undefined_epi32 (),
						     (__mmask16) -1, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
			       const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
						     (__v16si) __W,
						     (__mmask16) __U, __R);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U, __R);
}
#else
#define _mm512_cvt_roundps_epi32(A, B)		    \
    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))

#define _mm512_mask_cvt_roundps_epi32(W, U, A, B)   \
    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B))

#define _mm512_maskz_cvt_roundps_epi32(U, A, B)     \
    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))

#define _mm512_cvt_roundps_epu32(A, B)		    \
    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))

#define _mm512_mask_cvt_roundps_epu32(W, U, A, B)   \
    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B))

#define _mm512_maskz_cvt_roundps_epu32(U, A, B)     \
    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
#endif

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtu32_sd (__m128d __A, unsigned __B)
{
  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
}

#ifdef __x86_64__
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R)
{
  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R)
{
  return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R)
{
  return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
}
#else
#define _mm_cvt_roundu64_sd(A, B, C)   \
    (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C)

#define _mm_cvt_roundi64_sd(A, B, C)   \
    (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)

#define _mm_cvt_roundsi64_sd(A, B, C)   \
    (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
#endif

#endif

#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R)
{
  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R)
{
  return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R)
{
  return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
}
#else
#define _mm_cvt_roundu32_ss(A, B, C)   \
    (__m128)__builtin_ia32_cvtusi2ss32(A, B, C)

#define _mm_cvt_roundi32_ss(A, B, C)   \
    (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)

#define _mm_cvt_roundsi32_ss(A, B, C)   \
    (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
#endif

#ifdef __x86_64__
#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R)
{
  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R)
{
  return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R)
{
  return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
}
#else
#define _mm_cvt_roundu64_ss(A, B, C)   \
    (__m128)__builtin_ia32_cvtusi2ss64(A, B, C)

#define _mm_cvt_roundi64_ss(A, B, C)   \
    (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)

#define _mm_cvt_roundsi64_ss(A, B, C)   \
    (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
#endif

#endif

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
						  (__v16qi)
						  _mm_undefined_si128 (),
						  (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
						  (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
						   (__v16qi)
						   _mm_undefined_si128 (),
						   (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
						   (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
						   (__v16qi)
						   _mm_setzero_si128 (),
						   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtusepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
						    (__v16qi)
						    _mm_undefined_si128 (),
						    (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
						    (__v16qi) __O,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
						    (__v16qi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
						  (__v16hi)
						  _mm256_undefined_si256 (),
						  (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
						  (__v16hi) __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
						   (__v16hi)
						   _mm256_undefined_si256 (),
						   (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
						   (__v16hi) __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
						   (__v16hi)
						   _mm256_setzero_si256 (),
						   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtusepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
						    (__v16hi)
						    _mm256_undefined_si256 (),
						    (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
						    (__v16hi) __O,
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
						  (__v8si)
						  _mm256_undefined_si256 (),
						  (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
						  (__v8si) __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
						  (__v8si)
						  _mm256_setzero_si256 (),
						  __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
						   (__v8si)
						   _mm256_undefined_si256 (),
						   (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
						   (__v8si) __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
						   (__v8si)
						   _mm256_setzero_si256 (),
						   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtusepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
						    (__v8si)
						    _mm256_undefined_si256 (),
						    (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
						    (__v8si) __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
						    (__v8si)
						    _mm256_setzero_si256 (),
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
						  (__v8hi)
						  _mm_undefined_si128 (),
						  (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
						  (__v8hi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
						   (__v8hi)
						   _mm_undefined_si128 (),
						   (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
						   (__v8hi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
						   (__v8hi)
						   _mm_setzero_si128 (),
						   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtusepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
						    (__v8hi)
						    _mm_undefined_si128 (),
						    (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
						    (__v8hi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
						  (__v16qi)
						  _mm_undefined_si128 (),
						  (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
						  (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
						   (__v16qi)
						   _mm_undefined_si128 (),
						   (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
						   (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
						   (__v16qi)
						   _mm_setzero_si128 (),
						   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtusepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
						    (__v16qi)
						    _mm_undefined_si128 (),
						    (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
						    (__v16qi) __O,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
						    (__v16qi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi32_pd (__m256i __A)
{
  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
						    (__v8df) __W,
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
{
  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu32_pd (__m256i __A)
{
  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
						     (__v8df)
						     _mm512_undefined_pd (),
						     (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
						     (__v8df) __W,
						     (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
{
  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
						     (__v8df)
						     _mm512_setzero_pd (),
						     (__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundepi32_ps (__m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A,
			       const int __R)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
						   (__v16sf) __W,
						   (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundepu32_ps (__m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
						    (__v16sf)
						    _mm512_undefined_ps (),
						    (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A,
			       const int __R)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
						    (__v16sf) __W,
						    (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U, __R);
}

#else
#define _mm512_cvt_roundepi32_ps(A, B)        \
    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)

#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B)   \
    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B)

#define _mm512_maskz_cvt_roundepi32_ps(U, A, B)      \
    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)

#define _mm512_cvt_roundepu32_ps(A, B)        \
    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)

#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B)   \
    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B)

#define _mm512_maskz_cvt_roundepu32_ps(U, A, B)      \
    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_extractf64x4_pd (__m512d __A, const int __imm)
{
  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
						     __imm,
						     (__v4df)
						     _mm256_undefined_pd (),
						     (__mmask8) -1);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A,
			     const int __imm)
{
  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
						     __imm,
						     (__v4df) __W,
						     (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm)
{
  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
						     __imm,
						     (__v4df)
						     _mm256_setzero_pd (),
						     (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_extractf32x4_ps (__m512 __A, const int __imm)
{
  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
						    __imm,
						    (__v4sf)
						    _mm_undefined_ps (),
						    (__mmask8) -1);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A,
			     const int __imm)
{
  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
						    __imm,
						    (__v4sf) __W,
						    (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm)
{
  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
						    __imm,
						    (__v4sf)
						    _mm_setzero_ps (),
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_extracti64x4_epi64 (__m512i __A, const int __imm)
{
  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
						     __imm,
						     (__v4di)
						     _mm256_undefined_si256 (),
						     (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A,
				const int __imm)
{
  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
						     __imm,
						     (__v4di) __W,
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm)
{
  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
						     __imm,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_extracti32x4_epi32 (__m512i __A, const int __imm)
{
  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
						     __imm,
						     (__v4si)
						     _mm_undefined_si128 (),
						     (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A,
				const int __imm)
{
  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
						     __imm,
						     (__v4si) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm)
{
  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
						     __imm,
						     (__v4si)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}
#else

#define _mm512_extractf64x4_pd(X, C)                                    \
  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
    (int) (C),\
    (__v4df)(__m256d)_mm256_undefined_pd(),\
    (__mmask8)-1))

#define _mm512_mask_extractf64x4_pd(W, U, X, C)                         \
  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
    (int) (C),\
    (__v4df)(__m256d)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_extractf64x4_pd(U, X, C)                           \
  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
    (int) (C),\
    (__v4df)(__m256d)_mm256_setzero_pd(),\
    (__mmask8)(U)))

#define _mm512_extractf32x4_ps(X, C)                                    \
  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
    (int) (C),\
    (__v4sf)(__m128)_mm_undefined_ps(),\
    (__mmask8)-1))

#define _mm512_mask_extractf32x4_ps(W, U, X, C)                         \
  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
    (int) (C),\
    (__v4sf)(__m128)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_extractf32x4_ps(U, X, C)                           \
  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
    (int) (C),\
    (__v4sf)(__m128)_mm_setzero_ps(),\
    (__mmask8)(U)))

#define _mm512_extracti64x4_epi64(X, C)                                 \
  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
    (int) (C),\
    (__v4di)(__m256i)_mm256_undefined_si256 (),\
    (__mmask8)-1))

#define _mm512_mask_extracti64x4_epi64(W, U, X, C)                      \
  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
    (int) (C),\
    (__v4di)(__m256i)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_extracti64x4_epi64(U, X, C)                        \
  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
    (int) (C),\
    (__v4di)(__m256i)_mm256_setzero_si256 (),\
    (__mmask8)(U)))

#define _mm512_extracti32x4_epi32(X, C)                                 \
  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
    (int) (C),\
    (__v4si)(__m128i)_mm_undefined_si128 (),\
    (__mmask8)-1))

#define _mm512_mask_extracti32x4_epi32(W, U, X, C)                      \
  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
    (int) (C),\
    (__v4si)(__m128i)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_extracti32x4_epi32(U, X, C)                        \
  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
    (int) (C),\
    (__v4si)(__m128i)_mm_setzero_si128 (),\
    (__mmask8)(U)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A,
						    (__v4si) __B,
						    __imm,
						    (__v16si) __A, -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm)
{
  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A,
						   (__v4sf) __B,
						   __imm,
						   (__v16sf) __A, -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
						    (__v4di) __B,
						    __imm,
						    (__v8di)
						    _mm512_undefined_epi32 (),
						    (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A,
			 __m256i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
						    (__v4di) __B,
						    __imm,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B,
			  const int __imm)
{
  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
						    (__v4di) __B,
						    __imm,
						    (__v8di)
						    _mm512_setzero_si512 (),
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm)
{
  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
						    (__v4df) __B,
						    __imm,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A,
			 __m256d __B, const int __imm)
{
  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
						    (__v4df) __B,
						    __imm,
						    (__v8df) __W,
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B,
			  const int __imm)
{
  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
						    (__v4df) __B,
						    __imm,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U);
}
#else
#define _mm512_insertf32x4(X, Y, C)                                     \
  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1)))

#define _mm512_inserti32x4(X, Y, C)                                     \
  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
    (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1)))

#define _mm512_insertf64x4(X, Y, C)                                     \
  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
    (__v4df)(__m256d) (Y), (int) (C),					\
    (__v8df)(__m512d)_mm512_undefined_pd(),				\
    (__mmask8)-1))

#define _mm512_mask_insertf64x4(W, U, X, Y, C)                          \
  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
    (__v4df)(__m256d) (Y), (int) (C),					\
    (__v8df)(__m512d)(W),						\
    (__mmask8)(U)))

#define _mm512_maskz_insertf64x4(U, X, Y, C)                            \
  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
    (__v4df)(__m256d) (Y), (int) (C),					\
    (__v8df)(__m512d)_mm512_setzero_pd(),				\
    (__mmask8)(U)))

#define _mm512_inserti64x4(X, Y, C)                                     \
  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
    (__v4di)(__m256i) (Y), (int) (C),					\
    (__v8di)(__m512i)_mm512_undefined_epi32 (),				\
    (__mmask8)-1))

#define _mm512_mask_inserti64x4(W, U, X, Y, C)                          \
  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
    (__v4di)(__m256i) (Y), (int) (C),\
    (__v8di)(__m512i)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_inserti64x4(U, X, Y, C)                            \
  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
    (__v4di)(__m256i) (Y), (int) (C),					\
    (__v8di)(__m512i)_mm512_setzero_si512 (),				\
    (__mmask8)(U)))
#endif

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_loadu_pd (void const *__P)
{
  return *(__m512d_u *)__P;
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
						   (__v8df) __W,
						   (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
						   (__v8df)
						   _mm512_setzero_pd (),
						   (__mmask8) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_storeu_pd (void *__P, __m512d __A)
{
  *(__m512d_u *)__P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A,
				   (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_loadu_ps (void const *__P)
{
  return *(__m512_u *)__P;
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
						  (__v16sf) __W,
						  (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
						  (__v16sf)
						  _mm512_setzero_ps (),
						  (__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_storeu_ps (void *__P, __m512 __A)
{
  *(__m512_u *)__P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A,
				   (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
						     (__v8di) __W,
						     (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
						     (__v8di)
						     _mm512_setzero_si512 (),
						     (__mmask8) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_storedqudi512_mask ((long long *) __P, (__v8di) __A,
				     (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_loadu_si512 (void const *__P)
{
  return *(__m512i_u *)__P;
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
						     (__v16si) __W,
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_storeu_si512 (void *__P, __m512i __A)
{
  *(__m512i_u *)__P = __A;
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
				     (__mmask16) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutevar_pd (__m512d __A, __m512i __C)
{
  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
							(__v8di) __C,
							(__v8df)
							_mm512_undefined_pd (),
							(__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
							(__v8di) __C,
							(__v8df) __W,
							(__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
							(__v8di) __C,
							(__v8df)
							_mm512_setzero_pd (),
							(__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutevar_ps (__m512 __A, __m512i __C)
{
  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
						       (__v16si) __C,
						       (__v16sf)
						       _mm512_undefined_ps (),
						       (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
						       (__v16si) __C,
						       (__v16sf) __W,
						       (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
						       (__v16si) __C,
						       (__v16sf)
						       _mm512_setzero_ps (),
						       (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
						       /* idx */ ,
						       (__v8di) __A,
						       (__v8di) __B,
						       (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
				__m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
						       /* idx */ ,
						       (__v8di) __A,
						       (__v8di) __B,
						       (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
				 __mmask8 __U, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
						       (__v8di) __I
						       /* idx */ ,
						       (__v8di) __B,
						       (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
				 __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
							/* idx */ ,
							(__v8di) __A,
							(__v8di) __B,
							(__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
						       /* idx */ ,
						       (__v16si) __A,
						       (__v16si) __B,
						       (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
				__m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
						       /* idx */ ,
						       (__v16si) __A,
						       (__v16si) __B,
						       (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
				 __mmask16 __U, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
						       (__v16si) __I
						       /* idx */ ,
						       (__v16si) __B,
						       (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
				 __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
							/* idx */ ,
							(__v16si) __A,
							(__v16si) __B,
							(__mmask16) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
{
  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
							/* idx */ ,
							(__v8df) __A,
							(__v8df) __B,
							(__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I,
			     __m512d __B)
{
  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
							/* idx */ ,
							(__v8df) __A,
							(__v8df) __B,
							(__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
			      __m512d __B)
{
  return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
							(__v8di) __I
							/* idx */ ,
							(__v8df) __B,
							(__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
			      __m512d __B)
{
  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
							 /* idx */ ,
							 (__v8df) __A,
							 (__v8df) __B,
							 (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
{
  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
						       /* idx */ ,
						       (__v16sf) __A,
						       (__v16sf) __B,
						       (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
{
  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
						       /* idx */ ,
						       (__v16sf) __A,
						       (__v16sf) __B,
						       (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
			      __m512 __B)
{
  return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
						       (__v16si) __I
						       /* idx */ ,
						       (__v16sf) __B,
						       (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
			      __m512 __B)
{
  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
							/* idx */ ,
							(__v16sf) __A,
							(__v16sf) __B,
							(__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permute_pd (__m512d __X, const int __C)
{
  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
						     (__v8df)
						     _mm512_undefined_pd (),
						     (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C)
{
  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
						     (__v8df) __W,
						     (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C)
{
  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
						     (__v8df)
						     _mm512_setzero_pd (),
						     (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permute_ps (__m512 __X, const int __C)
{
  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
						    (__v16sf)
						    _mm512_undefined_ps (),
						    (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C)
{
  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
						    (__v16sf) __W,
						    (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C)
{
  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U);
}
#else
#define _mm512_permute_pd(X, C)							    \
  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),	    \
					      (__v8df)(__m512d)_mm512_undefined_pd(),\
					      (__mmask8)(-1)))

#define _mm512_mask_permute_pd(W, U, X, C)					    \
  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),	    \
					      (__v8df)(__m512d)(W),		    \
					      (__mmask8)(U)))

#define _mm512_maskz_permute_pd(U, X, C)					    \
  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),	    \
					      (__v8df)(__m512d)_mm512_setzero_pd(), \
					      (__mmask8)(U)))

#define _mm512_permute_ps(X, C)							    \
  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),	    \
					      (__v16sf)(__m512)_mm512_undefined_ps(),\
					      (__mmask16)(-1)))

#define _mm512_mask_permute_ps(W, U, X, C)					    \
  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),	    \
					      (__v16sf)(__m512)(W),		    \
					      (__mmask16)(U)))

#define _mm512_maskz_permute_ps(U, X, C)					    \
  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),	    \
					      (__v16sf)(__m512)_mm512_setzero_ps(), \
					      (__mmask16)(U)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex_epi64 (__m512i __X, const int __I)
{
  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) (-1));
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M,
			    __m512i __X, const int __I)
{
  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
						  (__v8di) __W,
						  (__mmask8) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I)
{
  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  (__mmask8) __M);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex_pd (__m512d __X, const int __M)
{
  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
						  (__v8df)
						  _mm512_undefined_pd (),
						  (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M)
{
  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
						  (__v8df) __W,
						  (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M)
{
  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
						  (__v8df)
						  _mm512_setzero_pd (),
						  (__mmask8) __U);
}
#else
#define _mm512_permutex_pd(X, M)						\
  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),	\
					    (__v8df)(__m512d)_mm512_undefined_pd(),\
					    (__mmask8)-1))

#define _mm512_mask_permutex_pd(W, U, X, M)					\
  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),	\
					    (__v8df)(__m512d)(W), (__mmask8)(U)))

#define _mm512_maskz_permutex_pd(U, X, M)					\
  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),	\
					    (__v8df)(__m512d)_mm512_setzero_pd(),\
					    (__mmask8)(U)))

#define _mm512_permutex_epi64(X, I)			          \
  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
					    (int)(I),             \
					    (__v8di)(__m512i)	  \
					    (_mm512_undefined_epi32 ()),\
					    (__mmask8)(-1)))

#define _mm512_maskz_permutex_epi64(M, X, I)                 \
  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
					    (int)(I),             \
					    (__v8di)(__m512i)     \
					    (_mm512_setzero_si512 ()),\
					    (__mmask8)(M)))

#define _mm512_mask_permutex_epi64(W, M, X, I)               \
  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
					    (int)(I),             \
					    (__v8di)(__m512i)(W), \
					    (__mmask8)(M)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
						     (__v8di) __X,
						     (__v8di)
						     _mm512_setzero_si512 (),
						     __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
						     (__v8di) __X,
						     (__v8di)
						     _mm512_undefined_epi32 (),
						     (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
			       __m512i __Y)
{
  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
						     (__v8di) __X,
						     (__v8di) __W,
						     __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
						     (__v16si) __X,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
						     (__v16si) __X,
						     (__v16si)
						     _mm512_undefined_epi32 (),
						     (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
			       __m512i __Y)
{
  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
						     (__v16si) __X,
						     (__v16si) __W,
						     __M);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
{
  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
						     (__v8di) __X,
						     (__v8df)
						     _mm512_undefined_pd (),
						     (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
						     (__v8di) __X,
						     (__v8df) __W,
						     (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
						     (__v8di) __X,
						     (__v8df)
						     _mm512_setzero_pd (),
						     (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
{
  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
						    (__v16si) __X,
						    (__v16sf)
						    _mm512_undefined_ps (),
						    (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
						    (__v16si) __X,
						    (__v16sf) __W,
						    (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
						    (__v16si) __X,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm)
{
  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
						 (__v16sf) __V, __imm,
						 (__v16sf)
						 _mm512_undefined_ps (),
						 (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M,
			__m512 __V, const int __imm)
{
  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
						 (__v16sf) __V, __imm,
						 (__v16sf) __W,
						 (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm)
{
  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
						 (__v16sf) __V, __imm,
						 (__v16sf)
						 _mm512_setzero_ps (),
						 (__mmask16) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm)
{
  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
						  (__v8df) __V, __imm,
						  (__v8df)
						  _mm512_undefined_pd (),
						  (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M,
			__m512d __V, const int __imm)
{
  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
						  (__v8df) __V, __imm,
						  (__v8df) __W,
						  (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V,
			 const int __imm)
{
  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
						  (__v8df) __V, __imm,
						  (__v8df)
						  _mm512_setzero_pd (),
						  (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C,
			  const int __imm, const int __R)
{
  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
						      (__v8df) __B,
						      (__v8di) __C,
						      __imm,
						      (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
			       __m512i __C, const int __imm, const int __R)
{
  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
						      (__v8df) __B,
						      (__v8di) __C,
						      __imm,
						      (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
				__m512i __C, const int __imm, const int __R)
{
  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
						       (__v8df) __B,
						       (__v8di) __C,
						       __imm,
						       (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C,
			  const int __imm, const int __R)
{
  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
						     (__v16sf) __B,
						     (__v16si) __C,
						     __imm,
						     (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
			       __m512i __C, const int __imm, const int __R)
{
  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
						     (__v16sf) __B,
						     (__v16si) __C,
						     __imm,
						     (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
				__m512i __C, const int __imm, const int __R)
{
  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
						      (__v16sf) __B,
						      (__v16si) __C,
						      __imm,
						      (__mmask16) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C,
		       const int __imm, const int __R)
{
  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
						   (__v2df) __B,
						   (__v2di) __C, __imm,
						   (__mmask8) -1, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B,
			    __m128i __C, const int __imm, const int __R)
{
  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
						   (__v2df) __B,
						   (__v2di) __C, __imm,
						   (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			     __m128i __C, const int __imm, const int __R)
{
  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
						    (__v2df) __B,
						    (__v2di) __C,
						    __imm,
						    (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C,
		       const int __imm, const int __R)
{
  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
						  (__v4sf) __B,
						  (__v4si) __C, __imm,
						  (__mmask8) -1, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B,
			    __m128i __C, const int __imm, const int __R)
{
  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
						  (__v4sf) __B,
						  (__v4si) __C, __imm,
						  (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			     __m128i __C, const int __imm, const int __R)
{
  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
						   (__v4sf) __B,
						   (__v4si) __C, __imm,
						   (__mmask8) __U, __R);
}

#else
#define _mm512_shuffle_pd(X, Y, C)                                      \
    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
        (__v8df)(__m512d)(Y), (int)(C),\
    (__v8df)(__m512d)_mm512_undefined_pd(),\
    (__mmask8)-1))

#define _mm512_mask_shuffle_pd(W, U, X, Y, C)                           \
    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
        (__v8df)(__m512d)(Y), (int)(C),\
    (__v8df)(__m512d)(W),\
    (__mmask8)(U)))

#define _mm512_maskz_shuffle_pd(U, X, Y, C)                             \
    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
        (__v8df)(__m512d)(Y), (int)(C),\
    (__v8df)(__m512d)_mm512_setzero_pd(),\
    (__mmask8)(U)))

#define _mm512_shuffle_ps(X, Y, C)                                      \
    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
        (__v16sf)(__m512)(Y), (int)(C),\
    (__v16sf)(__m512)_mm512_undefined_ps(),\
    (__mmask16)-1))

#define _mm512_mask_shuffle_ps(W, U, X, Y, C)                           \
    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
        (__v16sf)(__m512)(Y), (int)(C),\
    (__v16sf)(__m512)(W),\
    (__mmask16)(U)))

#define _mm512_maskz_shuffle_ps(U, X, Y, C)                             \
    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
        (__v16sf)(__m512)(Y), (int)(C),\
    (__v16sf)(__m512)_mm512_setzero_ps(),\
    (__mmask16)(U)))

#define _mm512_fixupimm_round_pd(X, Y, Z, C, R)					\
  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),	\
      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),		\
      (__mmask8)(-1), (R)))

#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R)                          \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),    \
      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
      (__mmask8)(U), (R)))

#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R)                         \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X),   \
      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
      (__mmask8)(U), (R)))

#define _mm512_fixupimm_round_ps(X, Y, Z, C, R)					\
  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),	\
    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),		\
    (__mmask16)(-1), (R)))

#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R)                          \
  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),     \
    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
    (__mmask16)(U), (R)))

#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R)                         \
  ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X),    \
    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
    (__mmask16)(U), (R)))

#define _mm_fixupimm_round_sd(X, Y, Z, C, R)					\
    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
      (__mmask8)(-1), (R)))

#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R)				\
    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), (R)))

#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R)				\
    ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X),	\
      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), (R)))

#define _mm_fixupimm_round_ss(X, Y, Z, C, R)					\
    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
      (__mmask8)(-1), (R)))

#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R)				\
    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), (R)))

#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R)				\
    ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X),	\
      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), (R)))
#endif

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_movehdup_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
						   (__v16sf) __W,
						   (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_moveldup_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
						   (__v16sf) __W,
						   (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_or_si512 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A | (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_or_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A | (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
						(__v16si) __B,
						(__v16si) __W,
						(__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
						(__v16si) __B,
						(__v16si)
						_mm512_setzero_si512 (),
						(__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_or_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A | (__v8du) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
						(__v8di) __B,
						(__v8di) __W,
						(__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
						(__v8di) __B,
						(__v8di)
						_mm512_setzero_si512 (),
						(__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_xor_si512 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A ^ (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_xor_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A ^ (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_xor_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A ^ (__v8du) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rol_epi32 (__m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
						 (__v16si)
						 _mm512_undefined_epi32 (),
						 (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_ror_epi32 (__m512i __A, int __B)
{
  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
						 (__v16si)
						 _mm512_undefined_epi32 (),
						 (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, int __B)
{
  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, int __B)
{
  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rol_epi64 (__m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_ror_epi64 (__m512i __A, int __B)
{
  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
						 (__v8di)
						 _mm512_undefined_epi32 (),
						 (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, int __B)
{
  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
						 (__v8di) __W,
						 (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
{
  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
						 (__v8di)
						 _mm512_setzero_si512 (),
						 (__mmask8) __U);
}

#else
#define _mm512_rol_epi32(A, B)						  \
    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v16si)_mm512_undefined_epi32 (), \
					    (__mmask16)(-1)))
#define _mm512_mask_rol_epi32(W, U, A, B)				  \
    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v16si)(__m512i)(W),	  \
					    (__mmask16)(U)))
#define _mm512_maskz_rol_epi32(U, A, B)					  \
    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v16si)_mm512_setzero_si512 (), \
					    (__mmask16)(U)))
#define _mm512_ror_epi32(A, B)						  \
    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v16si)_mm512_undefined_epi32 (), \
					    (__mmask16)(-1)))
#define _mm512_mask_ror_epi32(W, U, A, B)				  \
    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v16si)(__m512i)(W),	  \
					    (__mmask16)(U)))
#define _mm512_maskz_ror_epi32(U, A, B)					  \
    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v16si)_mm512_setzero_si512 (), \
					    (__mmask16)(U)))
#define _mm512_rol_epi64(A, B)						  \
    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v8di)_mm512_undefined_epi32 (),  \
					    (__mmask8)(-1)))
#define _mm512_mask_rol_epi64(W, U, A, B)				  \
    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v8di)(__m512i)(W),	  \
					    (__mmask8)(U)))
#define _mm512_maskz_rol_epi64(U, A, B)					  \
    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v8di)_mm512_setzero_si512 (),  \
					    (__mmask8)(U)))

#define _mm512_ror_epi64(A, B)						  \
    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v8di)_mm512_undefined_epi32 (),  \
					    (__mmask8)(-1)))
#define _mm512_mask_ror_epi64(W, U, A, B)				  \
    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v8di)(__m512i)(W),	  \
					    (__mmask8)(U)))
#define _mm512_maskz_ror_epi64(U, A, B)					  \
    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),	  \
					    (int)(B),			  \
					    (__v8di)_mm512_setzero_si512 (),  \
					    (__mmask8)(U)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_and_si512 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A & (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_and_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A & (__v16su) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si) __W,
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
						 (__v16si) __B,
						 (__v16si)
						 _mm512_setzero_si512 (),
						 (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_and_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A & (__v8du) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di) __W, __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
						 (__v8di) __B,
						 (__v8di)
						 _mm512_setzero_pd (),
						 __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_andnot_si512 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_andnot_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_andnot_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di) __W, __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_setzero_pd (),
						  __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_test_epi32_mask (__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
						(__v16si) __B,
						(__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
						(__v16si) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_test_epi64_mask (__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
					       (__v8di) __B,
					       (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
						 (__v16si) __B,
						 (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
						 (__v16si) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
						(__v8di) __B,
						(__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
						(__v8di) __B, __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_abs_ps (__m512 __A)
{
  return (__m512) _mm512_and_epi32 ((__m512i) __A,
				    _mm512_set1_epi32 (0x7fffffff));
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_abs_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) _mm512_mask_and_epi32 ((__m512i) __W, __U, (__m512i) __A,
					 _mm512_set1_epi32 (0x7fffffff));
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_abs_pd (__m512d __A)
{
  return (__m512d) _mm512_and_epi64 ((__m512i) __A,
				     _mm512_set1_epi64 (0x7fffffffffffffffLL));
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)
	 _mm512_mask_and_epi64 ((__m512i) __W, __U, (__m512i) __A,
				_mm512_set1_epi64 (0x7fffffffffffffffLL));
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpackhi_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si)
						     _mm512_undefined_epi32 (),
						     (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
			    __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si) __W,
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpackhi_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
						      (__v8di) __B,
						      (__v8di)
						      _mm512_undefined_epi32 (),
						      (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
						      (__v8di) __B,
						      (__v8di) __W,
						      (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
						      (__v8di) __B,
						      (__v8di)
						      _mm512_setzero_si512 (),
						      (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpacklo_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si)
						     _mm512_undefined_epi32 (),
						     (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
			    __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si) __W,
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
						      (__v8di) __B,
						      (__v8di)
						      _mm512_undefined_epi32 (),
						      (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
						      (__v8di) __B,
						      (__v8di) __W,
						      (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
						      (__v8di) __B,
						      (__v8di)
						      _mm512_setzero_si512 (),
						      (__mmask8) __U);
}

#ifdef __x86_64__
#ifdef __OPTIMIZE__
extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundss_u64 (__m128 __A, const int __R)
{
  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundss_si64 (__m128 __A, const int __R)
{
  return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundss_i64 (__m128 __A, const int __R)
{
  return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundss_u64 (__m128 __A, const int __R)
{
  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundss_i64 (__m128 __A, const int __R)
{
  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundss_si64 (__m128 __A, const int __R)
{
  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
}
#else
#define _mm_cvt_roundss_u64(A, B)   \
    ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B))

#define _mm_cvt_roundss_si64(A, B)   \
    ((long long)__builtin_ia32_vcvtss2si64(A, B))

#define _mm_cvt_roundss_i64(A, B)   \
    ((long long)__builtin_ia32_vcvtss2si64(A, B))

#define _mm_cvtt_roundss_u64(A, B)  \
    ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B))

#define _mm_cvtt_roundss_i64(A, B)  \
    ((long long)__builtin_ia32_vcvttss2si64(A, B))

#define _mm_cvtt_roundss_si64(A, B)  \
    ((long long)__builtin_ia32_vcvttss2si64(A, B))
#endif
#endif

#ifdef __OPTIMIZE__
extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundss_u32 (__m128 __A, const int __R)
{
  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundss_si32 (__m128 __A, const int __R)
{
  return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundss_i32 (__m128 __A, const int __R)
{
  return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
}

extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundss_u32 (__m128 __A, const int __R)
{
  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundss_i32 (__m128 __A, const int __R)
{
  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundss_si32 (__m128 __A, const int __R)
{
  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
}
#else
#define _mm_cvt_roundss_u32(A, B)   \
    ((unsigned)__builtin_ia32_vcvtss2usi32(A, B))

#define _mm_cvt_roundss_si32(A, B)   \
    ((int)__builtin_ia32_vcvtss2si32(A, B))

#define _mm_cvt_roundss_i32(A, B)   \
    ((int)__builtin_ia32_vcvtss2si32(A, B))

#define _mm_cvtt_roundss_u32(A, B)  \
    ((unsigned)__builtin_ia32_vcvttss2usi32(A, B))

#define _mm_cvtt_roundss_si32(A, B)  \
    ((int)__builtin_ia32_vcvttss2si32(A, B))

#define _mm_cvtt_roundss_i32(A, B)  \
    ((int)__builtin_ia32_vcvttss2si32(A, B))
#endif

#ifdef __x86_64__
#ifdef __OPTIMIZE__
extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsd_u64 (__m128d __A, const int __R)
{
  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsd_si64 (__m128d __A, const int __R)
{
  return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsd_i64 (__m128d __A, const int __R)
{
  return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundsd_u64 (__m128d __A, const int __R)
{
  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundsd_si64 (__m128d __A, const int __R)
{
  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundsd_i64 (__m128d __A, const int __R)
{
  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
}
#else
#define _mm_cvt_roundsd_u64(A, B)   \
    ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B))

#define _mm_cvt_roundsd_si64(A, B)   \
    ((long long)__builtin_ia32_vcvtsd2si64(A, B))

#define _mm_cvt_roundsd_i64(A, B)   \
    ((long long)__builtin_ia32_vcvtsd2si64(A, B))

#define _mm_cvtt_roundsd_u64(A, B)   \
    ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B))

#define _mm_cvtt_roundsd_si64(A, B)   \
    ((long long)__builtin_ia32_vcvttsd2si64(A, B))

#define _mm_cvtt_roundsd_i64(A, B)   \
    ((long long)__builtin_ia32_vcvttsd2si64(A, B))
#endif
#endif

#ifdef __OPTIMIZE__
extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsd_u32 (__m128d __A, const int __R)
{
  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsd_si32 (__m128d __A, const int __R)
{
  return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsd_i32 (__m128d __A, const int __R)
{
  return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
}

extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundsd_u32 (__m128d __A, const int __R)
{
  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundsd_i32 (__m128d __A, const int __R)
{
  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_roundsd_si32 (__m128d __A, const int __R)
{
  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
}
#else
#define _mm_cvt_roundsd_u32(A, B)   \
    ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B))

#define _mm_cvt_roundsd_si32(A, B)   \
    ((int)__builtin_ia32_vcvtsd2si32(A, B))

#define _mm_cvt_roundsd_i32(A, B)   \
    ((int)__builtin_ia32_vcvtsd2si32(A, B))

#define _mm_cvtt_roundsd_u32(A, B)   \
    ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B))

#define _mm_cvtt_roundsd_si32(A, B)   \
    ((int)__builtin_ia32_vcvttsd2si32(A, B))

#define _mm_cvtt_roundsd_i32(A, B)   \
    ((int)__builtin_ia32_vcvttsd2si32(A, B))
#endif

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_movedup_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
						   (__v8df)
						   _mm512_undefined_pd (),
						   (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
						   (__v8df) __W,
						   (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
						   (__v8df)
						   _mm512_setzero_pd (),
						   (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpacklo_pd (__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __W,
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpackhi_pd (__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __W,
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpackhi_ps (__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __W,
						   (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundps_pd (__m256 __A, const int __R)
{
  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A,
			    const int __R)
{
  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
						    (__v8df) __W,
						    (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R)
{
  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundph_ps (__m256i __A, const int __R)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
						    (__v16sf)
						    _mm512_undefined_ps (),
						    (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A,
			    const int __R)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
						    (__v16sf) __W,
						    (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U, __R);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundps_ph (__m512 __A, const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
						     __I,
						     (__v16hi)
						     _mm256_undefined_si256 (),
						     -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtps_ph (__m512 __A, const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
						     __I,
						     (__v16hi)
						     _mm256_undefined_si256 (),
						     -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A,
			    const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
						     __I,
						     (__v16hi) __U,
						     (__mmask16) __W);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
						     __I,
						     (__v16hi) __U,
						     (__mmask16) __W);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
						     __I,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) __W);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I)
{
  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
						     __I,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) __W);
}
#else
#define _mm512_cvt_roundps_pd(A, B)		 \
    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B)

#define _mm512_mask_cvt_roundps_pd(W, U, A, B)   \
    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B)

#define _mm512_maskz_cvt_roundps_pd(U, A, B)     \
    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B)

#define _mm512_cvt_roundph_ps(A, B)		 \
    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B)

#define _mm512_mask_cvt_roundph_ps(W, U, A, B)   \
    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B)

#define _mm512_maskz_cvt_roundph_ps(U, A, B)     \
    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B)

#define _mm512_cvt_roundps_ph(A, I)						 \
  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
    (__v16hi)_mm256_undefined_si256 (), -1))
#define _mm512_cvtps_ph(A, I)						 \
  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
    (__v16hi)_mm256_undefined_si256 (), -1))
#define _mm512_mask_cvt_roundps_ph(U, W, A, I)				 \
  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
    (__v16hi)(__m256i)(U), (__mmask16) (W)))
#define _mm512_mask_cvtps_ph(U, W, A, I)				 \
  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
    (__v16hi)(__m256i)(U), (__mmask16) (W)))
#define _mm512_maskz_cvt_roundps_ph(W, A, I)					 \
  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
    (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
#define _mm512_maskz_cvtps_ph(W, A, I)					 \
  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
    (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvt_roundpd_ps (__m512d __A, const int __R)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
						   (__v8sf)
						   _mm256_undefined_ps (),
						   (__mmask8) -1, __R);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A,
			    const int __R)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
						   (__v8sf) __W,
						   (__mmask8) __U, __R);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
						   (__v8sf)
						   _mm256_setzero_ps (),
						   (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R)
{
  return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A,
						 (__v2df) __B,
						 __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R)
{
  return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A,
						  (__v4sf) __B,
						  __R);
}
#else
#define _mm512_cvt_roundpd_ps(A, B)		 \
    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B)

#define _mm512_mask_cvt_roundpd_ps(W, U, A, B)   \
    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B)

#define _mm512_maskz_cvt_roundpd_ps(U, A, B)     \
    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B)

#define _mm_cvt_roundsd_ss(A, B, C)		 \
    (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C)

#define _mm_cvt_roundss_sd(A, B, C)		 \
    (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C)
#endif

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_stream_si512 (__m512i * __P, __m512i __A)
{
  __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_stream_ps (float *__P, __m512 __A)
{
  __builtin_ia32_movntps512 (__P, (__v16sf) __A);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_stream_pd (double *__P, __m512d __A)
{
  __builtin_ia32_movntpd512 (__P, (__v8df) __A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_stream_load_si512 (void *__P)
{
  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
}

/* Constants for mantissa extraction */
typedef enum
{
  _MM_MANT_NORM_1_2,		/* interval [1, 2)      */
  _MM_MANT_NORM_p5_2,		/* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1,		/* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5		/* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;

typedef enum
{
  _MM_MANT_SIGN_src,		/* sign = sign(SRC)     */
  _MM_MANT_SIGN_zero,		/* sign = 0             */
  _MM_MANT_SIGN_nan		/* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;

#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
						    (__v4sf) __B,
						    __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			   const int __R)
{
  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
						     (__v2df) __B,
						     __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getexp_round_ps (__m512 __A, const int __R)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			     const int __R)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
						   (__v16sf) __W,
						   (__mmask16) __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getexp_round_pd (__m512d __A, const int __R)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			     const int __R)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
						    (__v8df) __W,
						    (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
			 _MM_MANTISSA_SIGN_ENUM __C, const int __R)
{
  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
						     (__C << 2) | __B,
						     _mm512_undefined_pd (),
						     (__mmask8) -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
			      _MM_MANTISSA_NORM_ENUM __B,
			      _MM_MANTISSA_SIGN_ENUM __C, const int __R)
{
  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
						     (__C << 2) | __B,
						     (__v8df) __W, __U,
						     __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A,
			       _MM_MANTISSA_NORM_ENUM __B,
			       _MM_MANTISSA_SIGN_ENUM __C, const int __R)
{
  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
						     (__C << 2) | __B,
						     (__v8df)
						     _mm512_setzero_pd (),
						     __U, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
			 _MM_MANTISSA_SIGN_ENUM __C, const int __R)
{
  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
						    (__C << 2) | __B,
						    _mm512_undefined_ps (),
						    (__mmask16) -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
			      _MM_MANTISSA_NORM_ENUM __B,
			      _MM_MANTISSA_SIGN_ENUM __C, const int __R)
{
  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
						    (__C << 2) | __B,
						    (__v16sf) __W, __U,
						    __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A,
			       _MM_MANTISSA_NORM_ENUM __B,
			       _MM_MANTISSA_SIGN_ENUM __C, const int __R)
{
  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
						    (__C << 2) | __B,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getmant_round_sd (__m128d __A, __m128d __B,
		      _MM_MANTISSA_NORM_ENUM __C,
		      _MM_MANTISSA_SIGN_ENUM __D, const int __R)
{
  return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
						  (__v2df) __B,
						  (__D << 2) | __C,
						   __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			      __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
			      _MM_MANTISSA_SIGN_ENUM __D, const int __R)
{
  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
						    (__v2df) __B,
						    (__D << 2) | __C,
                                                    (__v2df) __W,
						     __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			       _MM_MANTISSA_NORM_ENUM __C,
			       _MM_MANTISSA_SIGN_ENUM __D, const int __R)
{
  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
							(__v2df) __B,
						        (__D << 2) | __C,
                                                        (__v2df)
                                                        _mm_setzero_pd(),
						        __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getmant_round_ss (__m128 __A, __m128 __B,
		      _MM_MANTISSA_NORM_ENUM __C,
		      _MM_MANTISSA_SIGN_ENUM __D, const int __R)
{
  return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
						  (__v4sf) __B,
						  (__D << 2) | __C,
						  __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			      __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
			      _MM_MANTISSA_SIGN_ENUM __D, const int __R)
{
  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
						    (__v4sf) __B,
						    (__D << 2) | __C,
                                                    (__v4sf) __W,
						     __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			       _MM_MANTISSA_NORM_ENUM __C,
			       _MM_MANTISSA_SIGN_ENUM __D, const int __R)
{
  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
							(__v4sf) __B,
						        (__D << 2) | __C,
                                                        (__v4sf)
                                                        _mm_setzero_ps(),
						        __U, __R);
}

#else
#define _mm512_getmant_round_pd(X, B, C, R)                                                  \
  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
                                              (int)(((C)<<2) | (B)),                \
                                              (__v8df)(__m512d)_mm512_undefined_pd(), \
                                              (__mmask8)-1,\
					      (R)))

#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R)                                       \
  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
                                              (int)(((C)<<2) | (B)),                \
                                              (__v8df)(__m512d)(W),                 \
                                              (__mmask8)(U),\
					      (R)))

#define _mm512_maskz_getmant_round_pd(U, X, B, C, R)                                         \
  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
                                              (int)(((C)<<2) | (B)),                \
                                              (__v8df)(__m512d)_mm512_setzero_pd(), \
                                              (__mmask8)(U),\
					      (R)))
#define _mm512_getmant_round_ps(X, B, C, R)                                                  \
  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
                                             (int)(((C)<<2) | (B)),                 \
                                             (__v16sf)(__m512)_mm512_undefined_ps(), \
                                             (__mmask16)-1,\
					     (R)))

#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R)                                       \
  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
                                             (int)(((C)<<2) | (B)),                 \
                                             (__v16sf)(__m512)(W),                  \
                                             (__mmask16)(U),\
					     (R)))

#define _mm512_maskz_getmant_round_ps(U, X, B, C, R)                                         \
  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
                                             (int)(((C)<<2) | (B)),                 \
                                             (__v16sf)(__m512)_mm512_setzero_ps(),  \
                                             (__mmask16)(U),\
					     (R)))
#define _mm_getmant_round_sd(X, Y, C, D, R)                                                  \
  ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X),                    \
					    (__v2df)(__m128d)(Y),	\
					    (int)(((D)<<2) | (C)),	\
					    (R)))

#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R)                                       \
  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                  \
					     (__v2df)(__m128d)(Y),                  \
                                             (int)(((D)<<2) | (C)),                 \
                                             (__v2df)(__m128d)(W),                   \
                                             (__mmask8)(U),\
					     (R)))

#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R)                                         \
  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                  \
                                                 (__v2df)(__m128d)(Y),                  \
                                             (int)(((D)<<2) | (C)),              \
                                             (__v2df)(__m128d)_mm_setzero_pd(),  \
                                             (__mmask8)(U),\
					     (R)))

#define _mm_getmant_round_ss(X, Y, C, D, R)                                                  \
  ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X),                      \
					   (__v4sf)(__m128)(Y),		\
					   (int)(((D)<<2) | (C)),	\
					   (R)))

#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R)                                       \
  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                  \
					     (__v4sf)(__m128)(Y),                  \
                                             (int)(((D)<<2) | (C)),                 \
                                             (__v4sf)(__m128)(W),                   \
                                             (__mmask8)(U),\
					     (R)))

#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R)                                         \
  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                  \
                                                 (__v4sf)(__m128)(Y),                  \
                                             (int)(((D)<<2) | (C)),              \
                                             (__v4sf)(__m128)_mm_setzero_ps(),  \
                                             (__mmask8)(U),\
					     (R)))

#define _mm_getexp_round_ss(A, B, R)						      \
  ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R))

#define _mm_mask_getexp_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C)

#define _mm_maskz_getexp_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)

#define _mm_getexp_round_sd(A, B, R)						       \
  ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R))

#define _mm_mask_getexp_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C)

#define _mm_maskz_getexp_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)


#define _mm512_getexp_round_ps(A, R)						\
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
  (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R))

#define _mm512_mask_getexp_round_ps(W, U, A, R)					\
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
  (__v16sf)(__m512)(W), (__mmask16)(U), R))

#define _mm512_maskz_getexp_round_ps(U, A, R)					\
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
  (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R))

#define _mm512_getexp_round_pd(A, R)						\
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
  (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R))

#define _mm512_mask_getexp_round_pd(W, U, A, R)					\
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
  (__v8df)(__m512d)(W), (__mmask8)(U), R))

#define _mm512_maskz_getexp_round_pd(U, A, R)					\
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
  (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R))
#endif

#ifdef __OPTIMIZE__
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
						  (__v16sf)
						  _mm512_undefined_ps (),
						  -1, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C,
				 const int __imm, const int __R)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
						  (__v16sf) __A,
						  (__mmask16) __B, __R);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B,
				  const int __imm, const int __R)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
						  __imm,
						  (__v16sf)
						  _mm512_setzero_ps (),
						  (__mmask16) __A, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
						   (__v8df)
						   _mm512_undefined_pd (),
						   -1, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B,
				 __m512d __C, const int __imm, const int __R)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
						   (__v8df) __A,
						   (__mmask8) __B, __R);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B,
				  const int __imm, const int __R)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
						   __imm,
						   (__v8df)
						   _mm512_setzero_pd (),
						   (__mmask8) __A, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm, const int __R)
{
  return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A,
						   (__v4sf) __B, __imm, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm,
			 const int __R)
{
  return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A,
						    (__v2df) __B, __imm, __R);
}

#else
#define _mm512_roundscale_round_ps(A, B, R) \
  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
    (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), R))
#define _mm512_mask_roundscale_round_ps(A, B, C, D, R)				\
  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C),	\
					    (int)(D),			\
					    (__v16sf)(__m512)(A),	\
					    (__mmask16)(B), R))
#define _mm512_maskz_roundscale_round_ps(A, B, C, R)				\
  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B),	\
					    (int)(C),			\
					    (__v16sf)_mm512_setzero_ps(),\
					    (__mmask16)(A), R))
#define _mm512_roundscale_round_pd(A, B, R) \
  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
    (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), R))
#define _mm512_mask_roundscale_round_pd(A, B, C, D, R)				\
  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C),	\
					     (int)(D),			\
					     (__v8df)(__m512d)(A),	\
					     (__mmask8)(B), R))
#define _mm512_maskz_roundscale_round_pd(A, B, C, R)				\
  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B),	\
					     (int)(C),			\
					     (__v8df)_mm512_setzero_pd(),\
					     (__mmask8)(A), R))
#define _mm_roundscale_round_ss(A, B, C, R)					\
  ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A),	\
    (__v4sf)(__m128)(B), (int)(C), R))
#define _mm_roundscale_round_sd(A, B, C, R)					\
  ((__m128d) __builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A),	\
    (__v2df)(__m128d)(B), (int)(C), R))
#endif

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_floor_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
						  _MM_FROUND_FLOOR,
						  (__v16sf) __A, -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_floor_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
						   _MM_FROUND_FLOOR,
						   (__v8df) __A, -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_ceil_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
						  _MM_FROUND_CEIL,
						  (__v16sf) __A, -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_ceil_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
						   _MM_FROUND_CEIL,
						   (__v8df) __A, -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
						  _MM_FROUND_FLOOR,
						  (__v16sf) __W, __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
						   _MM_FROUND_FLOOR,
						   (__v8df) __W, __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
						  _MM_FROUND_CEIL,
						  (__v16sf) __W, __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
						   _MM_FROUND_CEIL,
						   (__v8df) __W, __U,
						   _MM_FROUND_CUR_DIRECTION);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
						  (__v16si) __B, __imm,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
			  __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
						  (__v16si) __B, __imm,
						  (__v16si) __W,
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
			   const int __imm)
{
  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
						  (__v16si) __B, __imm,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
						  (__v8di) __B, __imm,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
			  __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
						  (__v8di) __B, __imm,
						  (__v8di) __W,
						  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
			   const int __imm)
{
  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
						  (__v8di) __B, __imm,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  (__mmask8) __U);
}
#else
#define _mm512_alignr_epi32(X, Y, C)                                        \
    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
        (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_epi32 (),\
        (__mmask16)-1))

#define _mm512_mask_alignr_epi32(W, U, X, Y, C)                             \
    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
        (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W),             \
        (__mmask16)(U)))

#define _mm512_maskz_alignr_epi32(U, X, Y, C)                               \
    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
        (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_setzero_si512 (),\
        (__mmask16)(U)))

#define _mm512_alignr_epi64(X, Y, C)                                        \
    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
        (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_epi32 (),  \
	(__mmask8)-1))

#define _mm512_mask_alignr_epi64(W, U, X, Y, C)                             \
    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
        (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U)))

#define _mm512_maskz_alignr_epi64(U, X, Y, C)                               \
    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
        (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_setzero_si512 (),\
        (__mmask8)(U)))
#endif

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
						     (__v16si) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
						    (__v8di) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
						    (__v8di) __B,
						    (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
						     (__v16si) __B,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
						     (__v16si) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
						    (__v8di) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
						    (__v8di) __B,
						    (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 5,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 5,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 5,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 5,
						    (__mmask16) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 5,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 5,
						    (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 5,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 5,
						    (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 2,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 2,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 2,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 2,
						    (__mmask16) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 2,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 2,
						    (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 2,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 2,
						    (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 1,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 1,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 1,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 1,
						    (__mmask16) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 1,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 1,
						    (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 1,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 1,
						    (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 4,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 4,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 4,
						    (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						    (__v16si) __Y, 4,
						    (__mmask16) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 4,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 4,
						    (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 4,
						    (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						    (__v8di) __Y, 4,
						    (__mmask8) -1);
}

#define _MM_CMPINT_EQ	    0x0
#define _MM_CMPINT_LT	    0x1
#define _MM_CMPINT_LE	    0x2
#define _MM_CMPINT_UNUSED   0x3
#define _MM_CMPINT_NE	    0x4
#define _MM_CMPINT_NLT	    0x5
#define _MM_CMPINT_GE	    0x5
#define _MM_CMPINT_NLE	    0x6
#define _MM_CMPINT_GT	    0x6

#ifdef __OPTIMIZE__
extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kshiftli_mask16 (__mmask16 __A, unsigned int __B)
{
  return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A,
						(__mmask8) __B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kshiftri_mask16 (__mmask16 __A, unsigned int __B)
{
  return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A,
						(__mmask8) __B);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						 (__v8di) __Y, __P,
						 (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						  (__v16si) __Y, __P,
						  (__mmask16) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						  (__v8di) __Y, __P,
						  (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						   (__v16si) __Y, __P,
						   (__mmask16) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P,
			  const int __R)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, __P,
						  (__mmask8) -1, __R);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, __P,
						   (__mmask16) -1, __R);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
			    const int __P)
{
  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
						 (__v8di) __Y, __P,
						 (__mmask8) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
			    const int __P)
{
  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
						  (__v16si) __Y, __P,
						  (__mmask16) __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
			    const int __P)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
						  (__v8di) __Y, __P,
						  (__mmask8) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
			    const int __P)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
						   (__v16si) __Y, __P,
						   (__mmask16) __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y,
			       const int __P, const int __R)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, __P,
						  (__mmask8) __U, __R);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y,
			       const int __P, const int __R)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, __P,
						   (__mmask16) __U, __R);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R)
{
  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
					       (__v2df) __Y, __P,
					       (__mmask8) -1, __R);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y,
			    const int __P, const int __R)
{
  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
					       (__v2df) __Y, __P,
					       (__mmask8) __M, __R);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R)
{
  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
					       (__v4sf) __Y, __P,
					       (__mmask8) -1, __R);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
			    const int __P, const int __R)
{
  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
					       (__v4sf) __Y, __P,
					       (__mmask8) __M, __R);
}

#else
#define _kshiftli_mask16(X, Y)						\
  ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y)))

#define _kshiftri_mask16(X, Y)						\
  ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y)))

#define _mm512_cmp_epi64_mask(X, Y, P)					\
  ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X),	\
					   (__v8di)(__m512i)(Y), (int)(P),\
					   (__mmask8)-1))

#define _mm512_cmp_epi32_mask(X, Y, P)					\
  ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X),	\
					    (__v16si)(__m512i)(Y), (int)(P), \
					    (__mmask16)-1))

#define _mm512_cmp_epu64_mask(X, Y, P)					\
  ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X),	\
					    (__v8di)(__m512i)(Y), (int)(P),\
					    (__mmask8)-1))

#define _mm512_cmp_epu32_mask(X, Y, P)					\
  ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X),	\
					     (__v16si)(__m512i)(Y), (int)(P), \
					     (__mmask16)-1))

#define _mm512_cmp_round_pd_mask(X, Y, P, R)				\
  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
					    (__v8df)(__m512d)(Y), (int)(P),\
					    (__mmask8)-1, R))

#define _mm512_cmp_round_ps_mask(X, Y, P, R)				\
  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
					     (__v16sf)(__m512)(Y), (int)(P),\
					     (__mmask16)-1, R))

#define _mm512_mask_cmp_epi64_mask(M, X, Y, P)				\
  ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X),	\
					   (__v8di)(__m512i)(Y), (int)(P),\
					   (__mmask8)(M)))

#define _mm512_mask_cmp_epi32_mask(M, X, Y, P)				\
  ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X),	\
					    (__v16si)(__m512i)(Y), (int)(P), \
					    (__mmask16)(M)))

#define _mm512_mask_cmp_epu64_mask(M, X, Y, P)				\
  ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X),	\
					    (__v8di)(__m512i)(Y), (int)(P),\
					    (__mmask8)(M)))

#define _mm512_mask_cmp_epu32_mask(M, X, Y, P)				\
  ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X),	\
					     (__v16si)(__m512i)(Y), (int)(P), \
					     (__mmask16)(M)))

#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R)			\
  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
					    (__v8df)(__m512d)(Y), (int)(P),\
					    (__mmask8)(M), R))

#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R)			\
  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
					     (__v16sf)(__m512)(Y), (int)(P),\
					     (__mmask16)(M), R))

#define _mm_cmp_round_sd_mask(X, Y, P, R)				\
  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
					 (__v2df)(__m128d)(Y), (int)(P),\
					 (__mmask8)-1, R))

#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R)			\
  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
					 (__v2df)(__m128d)(Y), (int)(P),\
					 (M), R))

#define _mm_cmp_round_ss_mask(X, Y, P, R)				\
  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
					 (__v4sf)(__m128)(Y), (int)(P), \
					 (__mmask8)-1, R))

#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R)			\
  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
					 (__v4sf)(__m128)(Y), (int)(P), \
					 (M), R))
#endif

#ifdef __OPTIMIZE__
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32gather_ps (__m512i __index, void const *__addr, int __scale)
{
  __m512 __v1_old = _mm512_undefined_ps ();
  __mmask16 __mask = 0xFFFF;

  return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old,
						__addr,
						(__v16si) __index,
						__mask, __scale);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32gather_ps (__m512 __v1_old, __mmask16 __mask,
			  __m512i __index, void const *__addr, int __scale)
{
  return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old,
						__addr,
						(__v16si) __index,
						__mask, __scale);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32gather_pd (__m256i __index, void const *__addr, int __scale)
{
  __m512d __v1_old = _mm512_undefined_pd ();
  __mmask8 __mask = 0xFF;

  return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old,
						__addr,
						(__v8si) __index, __mask,
						__scale);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask,
			  __m256i __index, void const *__addr, int __scale)
{
  return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old,
						__addr,
						(__v8si) __index,
						__mask, __scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64gather_ps (__m512i __index, void const *__addr, int __scale)
{
  __m256 __v1_old = _mm256_undefined_ps ();
  __mmask8 __mask = 0xFF;

  return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,
						__addr,
						(__v8di) __index, __mask,
						__scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask,
			  __m512i __index, void const *__addr, int __scale)
{
  return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,
						__addr,
						(__v8di) __index,
						__mask, __scale);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64gather_pd (__m512i __index, void const *__addr, int __scale)
{
  __m512d __v1_old = _mm512_undefined_pd ();
  __mmask8 __mask = 0xFF;

  return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old,
						__addr,
						(__v8di) __index, __mask,
						__scale);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 __mask,
			  __m512i __index, void const *__addr, int __scale)
{
  return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old,
						__addr,
						(__v8di) __index,
						__mask, __scale);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32gather_epi32 (__m512i __index, void const *__addr, int __scale)
{
  __m512i __v1_old = _mm512_undefined_epi32 ();
  __mmask16 __mask = 0xFFFF;

  return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old,
						 __addr,
						 (__v16si) __index,
						 __mask, __scale);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask,
			     __m512i __index, void const *__addr, int __scale)
{
  return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old,
						 __addr,
						 (__v16si) __index,
						 __mask, __scale);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32gather_epi64 (__m256i __index, void const *__addr, int __scale)
{
  __m512i __v1_old = _mm512_undefined_epi32 ();
  __mmask8 __mask = 0xFF;

  return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old,
						__addr,
						(__v8si) __index, __mask,
						__scale);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask,
			     __m256i __index, void const *__addr,
			     int __scale)
{
  return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old,
						__addr,
						(__v8si) __index,
						__mask, __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64gather_epi32 (__m512i __index, void const *__addr, int __scale)
{
  __m256i __v1_old = _mm256_undefined_si256 ();
  __mmask8 __mask = 0xFF;

  return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old,
						 __addr,
						 (__v8di) __index,
						 __mask, __scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask,
			     __m512i __index, void const *__addr, int __scale)
{
  return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old,
						 __addr,
						 (__v8di) __index,
						 __mask, __scale);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64gather_epi64 (__m512i __index, void const *__addr, int __scale)
{
  __m512i __v1_old = _mm512_undefined_epi32 ();
  __mmask8 __mask = 0xFF;

  return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old,
						__addr,
						(__v8di) __index, __mask,
						__scale);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask,
			     __m512i __index, void const *__addr,
			     int __scale)
{
  return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old,
						__addr,
						(__v8di) __index,
						__mask, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32scatter_ps (void *__addr, __m512i __index, __m512 __v1, int __scale)
{
  __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF,
				 (__v16si) __index, (__v16sf) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32scatter_ps (void *__addr, __mmask16 __mask,
			   __m512i __index, __m512 __v1, int __scale)
{
  __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index,
				 (__v16sf) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32scatter_pd (void *__addr, __m256i __index, __m512d __v1,
		      int __scale)
{
  __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF,
				(__v8si) __index, (__v8df) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
			   __m256i __index, __m512d __v1, int __scale)
{
  __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index,
				(__v8df) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64scatter_ps (void *__addr, __m512i __index, __m256 __v1, int __scale)
{
  __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF,
				 (__v8di) __index, (__v8sf) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
			   __m512i __index, __m256 __v1, int __scale)
{
  __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index,
				 (__v8sf) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64scatter_pd (void *__addr, __m512i __index, __m512d __v1,
		      int __scale)
{
  __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF,
				(__v8di) __index, (__v8df) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
			   __m512i __index, __m512d __v1, int __scale)
{
  __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index,
				(__v8df) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32scatter_epi32 (void *__addr, __m512i __index,
			 __m512i __v1, int __scale)
{
  __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF,
				 (__v16si) __index, (__v16si) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32scatter_epi32 (void *__addr, __mmask16 __mask,
			      __m512i __index, __m512i __v1, int __scale)
{
  __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index,
				 (__v16si) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i32scatter_epi64 (void *__addr, __m256i __index,
			 __m512i __v1, int __scale)
{
  __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF,
				(__v8si) __index, (__v8di) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
			      __m256i __index, __m512i __v1, int __scale)
{
  __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index,
				(__v8di) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64scatter_epi32 (void *__addr, __m512i __index,
			 __m256i __v1, int __scale)
{
  __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF,
				 (__v8di) __index, (__v8si) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
			      __m512i __index, __m256i __v1, int __scale)
{
  __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index,
				 (__v8si) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_i64scatter_epi64 (void *__addr, __m512i __index,
			 __m512i __v1, int __scale)
{
  __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF,
				(__v8di) __index, (__v8di) __v1, __scale);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
			      __m512i __index, __m512i __v1, int __scale)
{
  __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index,
				(__v8di) __v1, __scale);
}
#else
#define _mm512_i32gather_ps(INDEX, ADDR, SCALE)				\
  (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_undefined_ps(),\
					 (void const *) (ADDR),		\
					 (__v16si)(__m512i) (INDEX),	\
					 (__mmask16)0xFFFF,		\
					 (int) (SCALE))

#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512) (V1OLD),	\
					 (void const *) (ADDR),		\
					 (__v16si)(__m512i) (INDEX),	\
					 (__mmask16) (MASK),		\
					 (int) (SCALE))

#define _mm512_i32gather_pd(INDEX, ADDR, SCALE)				\
  (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_undefined_pd(),	\
					 (void const *) (ADDR),		\
					 (__v8si)(__m256i) (INDEX),	\
					 (__mmask8)0xFF, (int) (SCALE))

#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d) (V1OLD),	\
					 (void const *) (ADDR),		\
					 (__v8si)(__m256i) (INDEX),	\
					 (__mmask8) (MASK),		\
					 (int) (SCALE))

#define _mm512_i64gather_ps(INDEX, ADDR, SCALE)				\
  (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_undefined_ps(),	\
					 (void const *) (ADDR),		\
					 (__v8di)(__m512i) (INDEX),	\
					 (__mmask8)0xFF, (int) (SCALE))

#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256) (V1OLD),	\
					 (void const *) (ADDR),		\
					 (__v8di)(__m512i) (INDEX),	\
					 (__mmask8) (MASK),		\
					 (int) (SCALE))

#define _mm512_i64gather_pd(INDEX, ADDR, SCALE)				\
  (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_undefined_pd(),	\
					 (void const *) (ADDR),		\
					 (__v8di)(__m512i) (INDEX),	\
					 (__mmask8)0xFF, (int) (SCALE))

#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d) (V1OLD),	\
					 (void const *) (ADDR),		\
					 (__v8di)(__m512i) (INDEX),	\
					 (__mmask8) (MASK),		\
					 (int) (SCALE))

#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE)			\
  (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_epi32 (),\
					  (void const *) (ADDR),	\
					  (__v16si)(__m512i) (INDEX),	\
					  (__mmask16)0xFFFF,		\
					  (int) (SCALE))

#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i) (V1OLD),	\
					  (void const *) (ADDR),	\
					  (__v16si)(__m512i) (INDEX),	\
					  (__mmask16) (MASK),		\
					  (int) (SCALE))

#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE)			\
  (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_epi32 (),\
					 (void const *) (ADDR),		\
					 (__v8si)(__m256i) (INDEX),	\
					 (__mmask8)0xFF, (int) (SCALE))

#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m512i) __builtin_ia32_gathersiv8di ((__v8di)(__m512i) (V1OLD),	\
					 (void const *) (ADDR),		\
					 (__v8si)(__m256i) (INDEX),	\
					 (__mmask8) (MASK),		\
					 (int) (SCALE))

#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE)			   \
  (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_undefined_si256(),\
					  (void const *) (ADDR),	   \
					  (__v8di)(__m512i) (INDEX),	   \
					  (__mmask8)0xFF, (int) (SCALE))

#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)(__m256i) (V1OLD),	\
					  (void const *) (ADDR),	\
					  (__v8di)(__m512i) (INDEX),	\
					  (__mmask8) (MASK),		\
					  (int) (SCALE))

#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE)			\
  (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_epi32 (),\
					 (void const *) (ADDR),		\
					 (__v8di)(__m512i) (INDEX),	\
					 (__mmask8)0xFF, (int) (SCALE))

#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)	\
  (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i) (V1OLD),	\
					 (void const *) (ADDR),		\
					 (__v8di)(__m512i) (INDEX),	\
					 (__mmask8) (MASK),		\
					 (int) (SCALE))

#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16)0xFFFF,	\
				 (__v16si)(__m512i) (INDEX),		\
				 (__v16sf)(__m512) (V1), (int) (SCALE))

#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE)		\
  __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16) (MASK),	\
				 (__v16si)(__m512i) (INDEX),		\
				 (__v16sf)(__m512) (V1), (int) (SCALE))

#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8)0xFF,	\
				(__v8si)(__m256i) (INDEX),		\
				(__v8df)(__m512d) (V1), (int) (SCALE))

#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE)		\
  __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8) (MASK),	\
				(__v8si)(__m256i) (INDEX),		\
				(__v8df)(__m512d) (V1), (int) (SCALE))

#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask8)0xFF,	\
				 (__v8di)(__m512i) (INDEX),		\
				 (__v8sf)(__m256) (V1), (int) (SCALE))

#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE)		\
  __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask16) (MASK),	\
				 (__v8di)(__m512i) (INDEX),		\
				 (__v8sf)(__m256) (V1), (int) (SCALE))

#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8)0xFF,	\
				(__v8di)(__m512i) (INDEX),		\
				(__v8df)(__m512d) (V1), (int) (SCALE))

#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE)		\
  __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8) (MASK),	\
				(__v8di)(__m512i) (INDEX),		\
				(__v8df)(__m512d) (V1), (int) (SCALE))

#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16)0xFFFF,	\
				 (__v16si)(__m512i) (INDEX),		\
				 (__v16si)(__m512i) (V1), (int) (SCALE))

#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)	\
  __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16) (MASK),	\
				 (__v16si)(__m512i) (INDEX),		\
				 (__v16si)(__m512i) (V1), (int) (SCALE))

#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8)0xFF,	\
				(__v8si)(__m256i) (INDEX),		\
				(__v8di)(__m512i) (V1), (int) (SCALE))

#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)	\
  __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8) (MASK),	\
				(__v8si)(__m256i) (INDEX),		\
				(__v8di)(__m512i) (V1), (int) (SCALE))

#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8)0xFF,	\
				 (__v8di)(__m512i) (INDEX),		\
				 (__v8si)(__m256i) (V1), (int) (SCALE))

#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)	\
  __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8) (MASK),	\
				 (__v8di)(__m512i) (INDEX),		\
				 (__v8si)(__m256i) (V1), (int) (SCALE))

#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE)			\
  __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8)0xFF,	\
				(__v8di)(__m512i) (INDEX),		\
				(__v8di)(__m512i) (V1), (int) (SCALE))

#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)	\
  __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8) (MASK),	\
				(__v8di)(__m512i) (INDEX),		\
				(__v8di)(__m512i) (V1), (int) (SCALE))
#endif

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
						      (__v8df) __W,
						      (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
						      (__v8df)
						      _mm512_setzero_pd (),
						      (__mmask8) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
					  (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
						     (__v16sf) __W,
						     (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
						     (__v16sf)
						     _mm512_setzero_ps (),
						     (__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
					  (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
						      (__v8di) __W,
						      (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
						      (__v8di)
						      _mm512_setzero_si512 (),
						      (__mmask8) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
					  (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
						      (__v16si) __W,
						      (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
						      (__v16si)
						      _mm512_setzero_si512 (),
						      (__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
					  (__mmask16) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
						    (__v8df) __W,
						    (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A,
						     (__v8df)
						     _mm512_setzero_pd (),
						     (__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P,
							(__v8df) __W,
							(__mmask8) __U);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P,
							 (__v8df)
							 _mm512_setzero_pd (),
							 (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
						   (__v16sf) __W,
						   (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P,
						       (__v16sf) __W,
						       (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P,
							(__v16sf)
							_mm512_setzero_ps (),
							(__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A,
						     (__v8di)
						     _mm512_setzero_si512 (),
						     (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P,
							(__v8di) __W,
							(__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
{
  return (__m512i)
	 __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P,
					       (__v8di)
					       _mm512_setzero_si512 (),
					       (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
						    (__v16si) __W,
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P,
							(__v16si) __W,
							(__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P,
							 (__v16si)
							 _mm512_setzero_si512
							 (), (__mmask16) __U);
}

/* Mask arithmetic operations */
#define _kand_mask16 _mm512_kand
#define _kandn_mask16 _mm512_kandn
#define _knot_mask16 _mm512_knot
#define _kor_mask16 _mm512_kor
#define _kxnor_mask16 _mm512_kxnor
#define _kxor_mask16 _mm512_kxor

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortest_mask16_u8  (__mmask16 __A,  __mmask16 __B, unsigned char *__CF)
{
  *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B);
  return (unsigned char) __builtin_ia32_kortestzhi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B)
{
  return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A,
						    (__mmask16) __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B)
{
  return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A,
						    (__mmask16) __B);
}

extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtmask16_u32 (__mmask16 __A)
{
  return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtu32_mask16 (unsigned int __A)
{
  return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_load_mask16 (__mmask16 *__A)
{
  return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_store_mask16 (__mmask16 *__A, __mmask16 __B)
{
  *(__mmask16 *) __A = __builtin_ia32_kmovw (__B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kand (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kandn (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A,
					     (__mmask16) __B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kortestz (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A,
						(__mmask16) __B);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kortestc (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A,
						(__mmask16) __B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kxnor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kxor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_knot (__mmask16 __A)
{
  return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kunpackb_mask16 (__mmask8 __A, __mmask8 __B)
{
  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
}

#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D,
			  const int __imm)
{
  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
						    (__v4si) __D,
						    __imm,
						    (__v16si)
						    _mm512_setzero_si512 (),
						    __B);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D,
			  const int __imm)
{
  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
						   (__v4sf) __D,
						   __imm,
						   (__v16sf)
						   _mm512_setzero_ps (), __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C,
			 __m128i __D, const int __imm)
{
  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
						    (__v4si) __D,
						    __imm,
						    (__v16si) __A,
						    __B);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C,
			 __m128 __D, const int __imm)
{
  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
						   (__v4sf) __D,
						   __imm,
						   (__v16sf) __A, __B);
}
#else
#define _mm512_maskz_insertf32x4(A, X, Y, C)                            \
  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(),      \
    (__mmask16)(A)))

#define _mm512_maskz_inserti32x4(A, X, Y, C)                            \
  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
    (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (),     \
    (__mmask16)(A)))

#define _mm512_mask_insertf32x4(A, B, X, Y, C)                          \
  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A),             \
					     (__mmask16)(B)))

#define _mm512_mask_inserti32x4(A, B, X, Y, C)                          \
  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
    (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A),           \
					      (__mmask16)(B)))
#endif

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epu64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epu64 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_undefined_epi32 (),
						  (__mmask8) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
						  (__v8di) __B,
						  (__v8di)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epu32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W, __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epu32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_undefined_epi32 (),
						  (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si)
						  _mm512_setzero_si512 (),
						  __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
						  (__v16si) __B,
						  (__v16si) __W, __M);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpacklo_ps (__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __W,
						   (__mmask16) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
					       (__v2df) __B,
					       __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
					      (__v4sf) __B,
					      __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			   const int __R)
{
  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_round_sd (__m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
					       (__v2df) __B,
					       __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
			   const int __R)
{
  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_round_ss (__m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
					      (__v4sf) __B,
					      __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U, __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
			   const int __R)
{
  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U, __R);
}

#else
#define _mm_max_round_sd(A, B, C)            \
    (__m128d)__builtin_ia32_maxsd_round(A, B, C)

#define _mm_mask_max_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C)

#define _mm_maskz_max_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)

#define _mm_max_round_ss(A, B, C)            \
    (__m128)__builtin_ia32_maxss_round(A, B, C)

#define _mm_mask_max_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C)

#define _mm_maskz_max_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)

#define _mm_min_round_sd(A, B, C)            \
    (__m128d)__builtin_ia32_minsd_round(A, B, C)

#define _mm_mask_min_round_sd(W, U, A, B, C) \
    (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C)

#define _mm_maskz_min_round_sd(U, A, B, C)   \
    (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)

#define _mm_min_round_ss(A, B, C)            \
    (__m128)__builtin_ia32_minss_round(A, B, C)

#define _mm_mask_min_round_ss(W, U, A, B, C) \
    (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C)

#define _mm_maskz_min_round_ss(U, A, B, C)   \
    (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)

#endif

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W)
{
  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
						     (__v8df) __W,
						     (__mmask8) __U);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W)
{
  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
						    (__v16sf) __W,
						    (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
						    (__v8di) __W,
						    (__mmask8) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
						    (__v16si) __W,
						    (__mmask16) __U);
}

#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
						   (__v2df) __A,
						   (__v2df) __B,
						   __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
						  (__v4sf) __A,
						  (__v4sf) __B,
						  __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
						   (__v2df) __A,
						   -(__v2df) __B,
						   __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
						  (__v4sf) __A,
						  -(__v4sf) __B,
						  __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
						   -(__v2df) __A,
						   (__v2df) __B,
						   __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
						  -(__v4sf) __A,
						  (__v4sf) __B,
						  __R);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
{
  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
						   -(__v2df) __A,
						   -(__v2df) __B,
						   __R);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
{
  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
						  -(__v4sf) __A,
						  -(__v4sf) __B,
						  __R);
}
#else
#define _mm_fmadd_round_sd(A, B, C, R)            \
    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R)

#define _mm_fmadd_round_ss(A, B, C, R)            \
    (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R)

#define _mm_fmsub_round_sd(A, B, C, R)            \
    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R)

#define _mm_fmsub_round_ss(A, B, C, R)            \
    (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R)

#define _mm_fnmadd_round_sd(A, B, C, R)            \
    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R)

#define _mm_fnmadd_round_ss(A, B, C, R)            \
   (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R)

#define _mm_fnmsub_round_sd(A, B, C, R)            \
    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)

#define _mm_fnmsub_round_ss(A, B, C, R)            \
    (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
#endif

#ifdef __OPTIMIZE__
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R)
{
  return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R)
{
  return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R);
}
#else
#define _mm_comi_round_ss(A, B, C, D)\
__builtin_ia32_vcomiss(A, B, C, D)
#define _mm_comi_round_sd(A, B, C, D)\
__builtin_ia32_vcomisd(A, B, C, D)
#endif

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sqrt_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
						  (__v8df)
						  _mm512_undefined_pd (),
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
						  (__v8df) __W,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
						  (__v8df)
						  _mm512_setzero_pd (),
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sqrt_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
						 (__v16sf)
						 _mm512_undefined_ps (),
						 (__mmask16) -1,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
						 (__v16sf) __W,
						 (__mmask16) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
						 (__v16sf)
						 _mm512_setzero_ps (),
						 (__mmask16) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_pd (__m512d __A, __m512d __B)
{
  return (__m512d) ((__v8df)__A + (__v8df)__B);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_ps (__m512 __A, __m512 __B)
{
  return (__m512) ((__v16sf)__A + (__v16sf)__B);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
						(__v2df) __B,
						(__v2df) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
						(__v2df) __B,
						(__v2df)
						_mm_setzero_pd (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf)
						_mm_setzero_ps (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_pd (__m512d __A, __m512d __B)
{
  return (__m512d) ((__v8df)__A - (__v8df)__B);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_ps (__m512 __A, __m512 __B)
{
  return (__m512) ((__v16sf)__A - (__v16sf)__B);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
						(__v2df) __B,
						(__v2df) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
						(__v2df) __B,
						(__v2df)
						_mm_setzero_pd (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf)
						_mm_setzero_ps (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_pd (__m512d __A, __m512d __B)
{
  return (__m512d) ((__v8df)__A * (__v8df)__B);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_ps (__m512 __A, __m512 __B)
{
  return (__m512) ((__v16sf)__A * (__v16sf)__B);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B)
{
  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B)
{
  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_pd (__m512d __M, __m512d __V)
{
  return (__m512d) ((__v8df)__M / (__v8df)__V);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
{
  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
						 (__v8df) __V,
						 (__v8df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V)
{
  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
						 (__v8df) __V,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_ps (__m512 __A, __m512 __B)
{
  return (__m512) ((__v16sf)__A / (__v16sf)__B);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A,
			  __m128d __B)
{
  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A,
			  __m128 __B)
{
  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_pd (__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_ps (__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf)
						_mm_setzero_ps (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_pd (__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_undefined_pd (),
						 (__mmask8) -1,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
						 (__v8df) __B,
						 (__v8df)
						 _mm512_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_ps (__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_undefined_ps (),
						(__mmask16) -1,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf) __W,
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
						(__v16sf) __B,
						(__v16sf)
						_mm512_setzero_ps (),
						(__mmask16) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf)
						_mm_setzero_ps (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_scalef_pd (__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __W,
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_scalef_ps (__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __W,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_scalef_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
						    (__v2df) __B,
						    (__v2df)
						    _mm_setzero_pd (),
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_scalef_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
						   (__v4sf) __B,
						   (__v4sf)
						   _mm_setzero_ps (),
						   (__mmask8) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __C,
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __C,
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __C,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __C,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    -(__v8df) __C,
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
						    (__v8df) __B,
						    -(__v8df) __C,
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
						     (__v8df) __B,
						     -(__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   -(__v16sf) __C,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
						   (__v16sf) __B,
						   -(__v16sf) __C,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
						    (__v16sf) __B,
						    -(__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       (__v8df) __C,
						       (__mmask8) -1,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       (__v8df) __C,
						       (__mmask8) __U,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
							(__v8df) __B,
							(__v8df) __C,
							(__mmask8) __U,
							_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
							(__v8df) __B,
							(__v8df) __C,
							(__mmask8) __U,
							_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      (__v16sf) __C,
						      (__mmask16) -1,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      (__v16sf) __C,
						      (__mmask16) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
						       (__v16sf) __B,
						       (__v16sf) __C,
						       (__mmask16) __U,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
						       (__v16sf) __B,
						       (__v16sf) __C,
						       (__mmask16) __U,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       -(__v8df) __C,
						       (__mmask8) -1,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
						       (__v8df) __B,
						       -(__v8df) __C,
						       (__mmask8) __U,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
							(__v8df) __B,
							(__v8df) __C,
							(__mmask8) __U,
							_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
							(__v8df) __B,
							-(__v8df) __C,
							(__mmask8) __U,
							_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      -(__v16sf) __C,
						      (__mmask16) -1,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
						      (__v16sf) __B,
						      -(__v16sf) __C,
						      (__mmask16) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
						       (__v16sf) __B,
						       (__v16sf) __C,
						       (__mmask16) __U,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
						       (__v16sf) __B,
						       -(__v16sf) __C,
						       (__mmask16) __U,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
						    (__v8df) __B,
						    (__v8df) __C,
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
						   (__v16sf) __B,
						   (__v16sf) __C,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
						    (__v8df) __B,
						    -(__v8df) __C,
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
						     (__v8df) __B,
						     (__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
						      (__v8df) __B,
						      (__v8df) __C,
						      (__mmask8) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
						     (__v8df) __B,
						     -(__v8df) __C,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
						   (__v16sf) __B,
						   -(__v16sf) __C,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
						    (__v16sf) __B,
						    (__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
						     (__v16sf) __B,
						     (__v16sf) __C,
						     (__mmask16) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
						    (__v16sf) __B,
						    -(__v16sf) __C,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvttpd_epi32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_undefined_si256 (),
						     (__mmask8) -1,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
						     (__v8si) __W,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvttpd_epu32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
						      (__v8si)
						      _mm256_undefined_si256 (),
						      (__mmask8) -1,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
						      (__v8si) __W,
						      (__mmask8) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
						      (__v8si)
						      _mm256_setzero_si256 (),
						      (__mmask8) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtpd_epi32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
						    (__v8si)
						    _mm256_undefined_si256 (),
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
						    (__v8si) __W,
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
						    (__v8si)
						    _mm256_setzero_si256 (),
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtpd_epu32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_undefined_si256 (),
						     (__mmask8) -1,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
						     (__v8si) __W,
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
						     (__v8si)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvttps_epi32 (__m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_undefined_epi32 (),
						     (__mmask16) -1,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
						     (__v16si) __W,
						     (__mmask16) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvttps_epu32 (__m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
						      (__v16si)
						      _mm512_undefined_epi32 (),
						      (__mmask16) -1,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
						      (__v16si) __W,
						      (__mmask16) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
						      (__v16si)
						      _mm512_setzero_si512 (),
						      (__mmask16) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtps_epi32 (__m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
						    (__v16si)
						    _mm512_undefined_epi32 (),
						    (__mmask16) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
						    (__v16si) __W,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
						    (__v16si)
						    _mm512_setzero_si512 (),
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtps_epu32 (__m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_undefined_epi32 (),
						     (__mmask16) -1,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
						     (__v16si) __W,
						     (__mmask16) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
						     (__v16si)
						     _mm512_setzero_si512 (),
						     (__mmask16) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsd_f64 (__m512d __A)
{
  return __A[0];
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtss_f32 (__m512 __A)
{
  return __A[0];
}

#ifdef __x86_64__
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
{
  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
					      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
{
  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
					       _MM_FROUND_CUR_DIRECTION);
}
#endif

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtu32_ss (__m128 __A, unsigned __B)
{
  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
					      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi32_ps (__m512i __A)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
						   (__v16sf) __W,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
{
  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu32_ps (__m512i __A)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
						    (__v16sf)
						    _mm512_undefined_ps (),
						    (__mmask16) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
						    (__v16sf) __W,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
{
  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm)
{
  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
						      (__v8df) __B,
						      (__v8di) __C,
						      __imm,
						      (__mmask8) -1,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B,
			 __m512i __C, const int __imm)
{
  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
						      (__v8df) __B,
						      (__v8di) __C,
						      __imm,
						      (__mmask8) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B,
			  __m512i __C, const int __imm)
{
  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
						       (__v8df) __B,
						       (__v8di) __C,
						       __imm,
						       (__mmask8) __U,
						       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm)
{
  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
						     (__v16sf) __B,
						     (__v16si) __C,
						     __imm,
						     (__mmask16) -1,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B,
			 __m512i __C, const int __imm)
{
  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
						     (__v16sf) __B,
						     (__v16si) __C,
						     __imm,
						     (__mmask16) __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B,
			  __m512i __C, const int __imm)
{
  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
						      (__v16sf) __B,
						      (__v16si) __C,
						      __imm,
						      (__mmask16) __U,
						      _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm)
{
  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
						   (__v2df) __B,
						   (__v2di) __C, __imm,
						   (__mmask8) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B,
		      __m128i __C, const int __imm)
{
  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
						   (__v2df) __B,
						   (__v2di) __C, __imm,
						   (__mmask8) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B,
		       __m128i __C, const int __imm)
{
  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
						    (__v2df) __B,
						    (__v2di) __C,
						    __imm,
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm)
{
  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
						  (__v4sf) __B,
						  (__v4si) __C, __imm,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B,
		      __m128i __C, const int __imm)
{
  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
						  (__v4sf) __B,
						  (__v4si) __C, __imm,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B,
		       __m128i __C, const int __imm)
{
  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
						   (__v4sf) __B,
						   (__v4si) __C, __imm,
						   (__mmask8) __U,
						   _MM_FROUND_CUR_DIRECTION);
}
#else
#define _mm512_fixupimm_pd(X, Y, Z, C)					\
  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),	\
      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),		\
      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C)                          \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),    \
      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C)                         \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X),   \
      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_fixupimm_ps(X, Y, Z, C)					\
  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),	\
    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),		\
    (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C)                          \
  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),     \
    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
    (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C)                         \
  ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X),    \
    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
    (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_fixupimm_sd(X, Y, Z, C)					\
    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_sd(X, U, Y, Z, C)				\
    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C)				\
    ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X),	\
      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_fixupimm_ss(X, Y, Z, C)					\
    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_ss(X, U, Y, Z, C)				\
    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C)				\
    ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X),	\
      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
#endif

#ifdef __x86_64__
extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_u64 (__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
							   __A,
							   _MM_FROUND_CUR_DIRECTION);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_u64 (__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
							    __A,
							    _MM_FROUND_CUR_DIRECTION);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_i64 (__m128 __A)
{
  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
						  _MM_FROUND_CUR_DIRECTION);
}
#endif /* __x86_64__ */

extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_u32 (__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_u32 (__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_i32 (__m128 __A)
{
  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
					    _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_u64 (__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
							   __A,
							   _MM_FROUND_CUR_DIRECTION);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_u64 (__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
							    __A,
							    _MM_FROUND_CUR_DIRECTION);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_i64 (__m128d __A)
{
  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
						  _MM_FROUND_CUR_DIRECTION);
}
#endif /* __x86_64__ */

extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_u32 (__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
						 _MM_FROUND_CUR_DIRECTION);
}

extern __inline unsigned
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_u32 (__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_i32 (__m128d __A)
{
  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
					    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtps_pd (__m256 __A)
{
  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
{
  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
						    (__v8df) __W,
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
{
  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtph_ps (__m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
						    (__v16sf)
						    _mm512_undefined_ps (),
						    (__mmask16) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
						    (__v16sf) __W,
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    (__mmask16) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtpd_ps (__m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
						   (__v8sf)
						   _mm256_undefined_ps (),
						   (__mmask8) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
						   (__v8sf) __W,
						   (__mmask8) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
						   (__v8sf)
						   _mm256_setzero_ps (),
						   (__mmask8) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

#ifdef __OPTIMIZE__
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getexp_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_undefined_ps (),
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
						   (__v16sf) __W,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
						   (__v16sf)
						   _mm512_setzero_ps (),
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getexp_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
						    (__v8df)
						    _mm512_undefined_pd (),
						    (__mmask8) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
						    (__v8df) __W,
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
						    (__v8df)
						    _mm512_setzero_pd (),
						    (__mmask8) __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getexp_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
						    (__v4sf) __B,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf)
						_mm_setzero_ps (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getexp_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
						     (__v2df) __B,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
						(__v2df) __B,
						(__v2df) __W,
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
						(__v2df) __B,
						(__v2df)
						_mm_setzero_pd (),
						(__mmask8) __U,
						_MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
		   _MM_MANTISSA_SIGN_ENUM __C)
{
  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
						     (__C << 2) | __B,
						     _mm512_undefined_pd (),
						     (__mmask8) -1,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A,
			_MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
{
  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
						     (__C << 2) | __B,
						     (__v8df) __W, __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getmant_pd (__mmask8 __U, __m512d __A,
			 _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
{
  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
						     (__C << 2) | __B,
						     (__v8df)
						     _mm512_setzero_pd (),
						     __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
		   _MM_MANTISSA_SIGN_ENUM __C)
{
  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
						    (__C << 2) | __B,
						    _mm512_undefined_ps (),
						    (__mmask16) -1,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A,
			_MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
{
  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
						    (__C << 2) | __B,
						    (__v16sf) __W, __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A,
			 _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
{
  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
						    (__C << 2) | __B,
						    (__v16sf)
						    _mm512_setzero_ps (),
						    __U,
						    _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
		_MM_MANTISSA_SIGN_ENUM __D)
{
  return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
						   (__v2df) __B,
						   (__D << 2) | __C,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
			_MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
{
  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
							(__v2df) __B,
						        (__D << 2) | __C,
                                                        (__v2df) __W,
						       __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B,
			 _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
{
  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
                                                        (__v2df) __B,
						        (__D << 2) | __C,
                                                        (__v2df)
							_mm_setzero_pd(),
						        __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
		_MM_MANTISSA_SIGN_ENUM __D)
{
  return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
						  (__v4sf) __B,
						  (__D << 2) | __C,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
			_MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
{
  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
							(__v4sf) __B,
						        (__D << 2) | __C,
                                                        (__v4sf) __W,
						       __U,
						     _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B,
			 _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
{
  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
                                                        (__v4sf) __B,
						        (__D << 2) | __C,
                                                        (__v4sf)
							_mm_setzero_ps(),
						        __U,
						     _MM_FROUND_CUR_DIRECTION);
}

#else
#define _mm512_getmant_pd(X, B, C)                                                  \
  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
                                              (int)(((C)<<2) | (B)),                \
                                              (__v8df)_mm512_undefined_pd(),        \
                                              (__mmask8)-1,\
					      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_pd(W, U, X, B, C)                                       \
  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
                                              (int)(((C)<<2) | (B)),                \
                                              (__v8df)(__m512d)(W),                 \
                                              (__mmask8)(U),\
					      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_pd(U, X, B, C)                                         \
  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
                                              (int)(((C)<<2) | (B)),                \
                                              (__v8df)_mm512_setzero_pd(),          \
                                              (__mmask8)(U),\
					      _MM_FROUND_CUR_DIRECTION))
#define _mm512_getmant_ps(X, B, C)                                                  \
  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
                                             (int)(((C)<<2) | (B)),                 \
                                             (__v16sf)_mm512_undefined_ps(),        \
                                             (__mmask16)-1,\
					     _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ps(W, U, X, B, C)                                       \
  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
                                             (int)(((C)<<2) | (B)),                 \
                                             (__v16sf)(__m512)(W),                  \
                                             (__mmask16)(U),\
					     _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ps(U, X, B, C)                                         \
  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
                                             (int)(((C)<<2) | (B)),                 \
                                             (__v16sf)_mm512_setzero_ps(),          \
                                             (__mmask16)(U),\
					     _MM_FROUND_CUR_DIRECTION))
#define _mm_getmant_sd(X, Y, C, D)                                                  \
  ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X),                    \
                                           (__v2df)(__m128d)(Y),                    \
                                           (int)(((D)<<2) | (C)),                   \
					   _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sd(W, U, X, Y, C, D)                                       \
  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                 \
                                                 (__v2df)(__m128d)(Y),                 \
                                                 (int)(((D)<<2) | (C)),                \
                                                (__v2df)(__m128d)(W),                 \
                                              (__mmask8)(U),\
					      _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_sd(U, X, Y, C, D)                                         \
  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                 \
                                           (__v2df)(__m128d)(Y),                     \
                                              (int)(((D)<<2) | (C)),                \
                                           (__v2df)_mm_setzero_pd(),             \
                                              (__mmask8)(U),\
					      _MM_FROUND_CUR_DIRECTION))

#define _mm_getmant_ss(X, Y, C, D)                                                  \
  ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X),                      \
                                          (__v4sf)(__m128)(Y),                      \
                                          (int)(((D)<<2) | (C)),                    \
					  _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_ss(W, U, X, Y, C, D)                                       \
  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                 \
                                                 (__v4sf)(__m128)(Y),                 \
                                                 (int)(((D)<<2) | (C)),                \
                                                (__v4sf)(__m128)(W),                 \
                                              (__mmask8)(U),\
					      _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_ss(U, X, Y, C, D)                                         \
  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                 \
                                           (__v4sf)(__m128)(Y),                     \
                                              (int)(((D)<<2) | (C)),                \
                                           (__v4sf)_mm_setzero_ps(),             \
                                              (__mmask8)(U),\
					      _MM_FROUND_CUR_DIRECTION))

#define _mm_getexp_ss(A, B)						      \
  ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B),  \
					   _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getexp_ss(W, U, A, B) \
    (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U,\
                                             _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_getexp_ss(U, A, B)   \
    (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U,\
					      _MM_FROUND_CUR_DIRECTION)

#define _mm_getexp_sd(A, B)						       \
  ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B),\
					    _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getexp_sd(W, U, A, B) \
    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U,\
                                             _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_getexp_sd(U, A, B)   \
    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U,\
					      _MM_FROUND_CUR_DIRECTION)

#define _mm512_getexp_ps(A)						\
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
  (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getexp_ps(W, U, A)					\
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
  (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getexp_ps(U, A)					\
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
  (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getexp_pd(A)						\
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
  (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getexp_pd(W, U, A)					\
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
  (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getexp_pd(U, A)					\
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
  (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
#endif

#ifdef __OPTIMIZE__
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_roundscale_ps (__m512 __A, const int __imm)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
						  (__v16sf)
						  _mm512_undefined_ps (),
						  -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C,
			   const int __imm)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
						  (__v16sf) __A,
						  (__mmask16) __B,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
						  __imm,
						  (__v16sf)
						  _mm512_setzero_ps (),
						  (__mmask16) __A,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_roundscale_pd (__m512d __A, const int __imm)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
						   (__v8df)
						   _mm512_undefined_pd (),
						   -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C,
			   const int __imm)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
						   (__v8df) __A,
						   (__mmask8) __B,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
						   __imm,
						   (__v8df)
						   _mm512_setzero_pd (),
						   (__mmask8) __A,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm)
{
  return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A,
						   (__v4sf) __B, __imm,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm)
{
  return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A,
						    (__v2df) __B, __imm,
						   _MM_FROUND_CUR_DIRECTION);
}

#else
#define _mm512_roundscale_ps(A, B) \
  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
    (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_roundscale_ps(A, B, C, D)				\
  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C),	\
					    (int)(D),			\
					    (__v16sf)(__m512)(A),	\
					    (__mmask16)(B), _MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_roundscale_ps(A, B, C)				\
  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B),	\
					    (int)(C),			\
					    (__v16sf)_mm512_setzero_ps(),\
					    (__mmask16)(A), _MM_FROUND_CUR_DIRECTION))
#define _mm512_roundscale_pd(A, B) \
  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
    (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_roundscale_pd(A, B, C, D)				\
  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C),	\
					     (int)(D),			\
					     (__v8df)(__m512d)(A),	\
					     (__mmask8)(B), _MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_roundscale_pd(A, B, C)				\
  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B),	\
					     (int)(C),			\
					     (__v8df)_mm512_setzero_pd(),\
					     (__mmask8)(A), _MM_FROUND_CUR_DIRECTION))
#define _mm_roundscale_ss(A, B, C)					\
  ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A),	\
  (__v4sf)(__m128)(B), (int)(C), _MM_FROUND_CUR_DIRECTION))
#define _mm_roundscale_sd(A, B, C)					\
  ((__m128d) __builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A),	\
    (__v2df)(__m128d)(B), (int)(C), _MM_FROUND_CUR_DIRECTION))
#endif

#ifdef __OPTIMIZE__
extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, __P,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, __P,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, __P,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, __P,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
					       (__v2df) __Y, __P,
					       (__mmask8) -1,
					       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
					       (__v2df) __Y, __P,
					       (__mmask8) __M,
					       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
					       (__v4sf) __Y, __P,
					       (__mmask8) -1,
					       _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
					       (__v4sf) __Y, __P,
					       (__mmask8) __M,
					       _MM_FROUND_CUR_DIRECTION);
}

#else
#define _mm512_cmp_pd_mask(X, Y, P)					\
  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
					    (__v8df)(__m512d)(Y), (int)(P),\
					    (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))

#define _mm512_cmp_ps_mask(X, Y, P)					\
  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
					     (__v16sf)(__m512)(Y), (int)(P),\
					     (__mmask16)-1,_MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_cmp_pd_mask(M, X, Y, P)					\
  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
					    (__v8df)(__m512d)(Y), (int)(P),\
					    (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_cmp_ps_mask(M, X, Y, P)					\
  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
					     (__v16sf)(__m512)(Y), (int)(P),\
					     (__mmask16)(M),_MM_FROUND_CUR_DIRECTION))

#define _mm_cmp_sd_mask(X, Y, P)					\
  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
					 (__v2df)(__m128d)(Y), (int)(P),\
					 (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sd_mask(M, X, Y, P)					\
  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
					 (__v2df)(__m128d)(Y), (int)(P),\
					 M,_MM_FROUND_CUR_DIRECTION))

#define _mm_cmp_ss_mask(X, Y, P)					\
  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
					 (__v4sf)(__m128)(Y), (int)(P), \
					 (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_ss_mask(M, X, Y, P)					\
  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
					 (__v4sf)(__m128)(Y), (int)(P), \
					 M,_MM_FROUND_CUR_DIRECTION))
#endif

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_EQ_OQ,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_EQ_OQ,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_LT_OS,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_LT_OS,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_LE_OS,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_LE_OS,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpunord_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_UNORD_Q,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpunord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_UNORD_Q,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_NEQ_UQ,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_NEQ_UQ,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpnlt_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_NLT_US,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpnlt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_NLT_US,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpnle_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_NLE_US,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpnle_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_NLE_US,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpord_pd_mask (__m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_ORD_Q,
						  (__mmask8) -1,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
{
  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
						  (__v8df) __Y, _CMP_ORD_Q,
						  (__mmask8) __U,
						  _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_EQ_OQ,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_EQ_OQ,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_LT_OS,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_LT_OS,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_LE_OS,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_LE_OS,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpunord_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_UNORD_Q,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpunord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_UNORD_Q,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_NEQ_UQ,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_NEQ_UQ,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpnlt_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_NLT_US,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpnlt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_NLT_US,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpnle_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_NLE_US,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpnle_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_NLE_US,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpord_ps_mask (__m512 __X, __m512 __Y)
{
  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_ORD_Q,
						   (__mmask16) -1,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
{
   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
						   (__v16sf) __Y, _CMP_ORD_Q,
						   (__mmask16) __U,
						   _MM_FROUND_CUR_DIRECTION);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kmov (__mmask16 __A)
{
  return __builtin_ia32_kmovw (__A);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castpd_ps (__m512d __A)
{
  return (__m512) (__A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castpd_si512 (__m512d __A)
{
  return (__m512i) (__A);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castps_pd (__m512 __A)
{
  return (__m512d) (__A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castps_si512 (__m512 __A)
{
  return (__m512i) (__A);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castsi512_ps (__m512i __A)
{
  return (__m512) (__A);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castsi512_pd (__m512i __A)
{
  return (__m512d) (__A);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castpd512_pd128 (__m512d __A)
{
  return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castps512_ps128 (__m512 __A)
{
  return _mm512_extractf32x4_ps(__A, 0);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castsi512_si128 (__m512i __A)
{
  return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castpd512_pd256 (__m512d __A)
{
  return _mm512_extractf64x4_pd(__A, 0);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castps512_ps256 (__m512 __A)
{
  return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castsi512_si256 (__m512i __A)
{
  return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castpd128_pd512 (__m128d __A)
{
  return (__m512d) __builtin_ia32_pd512_pd((__m128d)__A);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castps128_ps512 (__m128 __A)
{
  return (__m512) __builtin_ia32_ps512_ps((__m128)__A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castsi128_si512 (__m128i __A)
{
  return (__m512i) __builtin_ia32_si512_si((__v4si)__A);
}

extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castpd256_pd512 (__m256d __A)
{
  return __builtin_ia32_pd512_256pd (__A);
}

extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castps256_ps512 (__m256 __A)
{
  return __builtin_ia32_ps512_256ps (__A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_castsi256_si512 (__m256i __A)
{
  return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
						     (__v16si) __B, 0,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
						     (__v16si) __B, 0, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
						    (__v8di) __B, 0, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epu64_mask (__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
						    (__v8di) __B, 0,
						    (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epu32_mask (__m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
						     (__v16si) __B, 6,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
						     (__v16si) __B, 6,  __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
						    (__v8di) __B, 6, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epu64_mask (__m512i __A, __m512i __B)
{
  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
						    (__v8di) __B, 6,
						    (__mmask8) -1);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1);		\
  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0);		\
  __m256i __T3 = (__m256i) (__T1 op __T2);				\
  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1);		\
  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0);		\
  __v4si __T6 = __T4 op __T5;						\
  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });	\
  __v4si __T8 = __T6 op __T7;						\
  return __T8[0] op __T8[1]

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_add_epi32 (__m512i __A)
{
  __MM512_REDUCE_OP (+);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_mul_epi32 (__m512i __A)
{
  __MM512_REDUCE_OP (*);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_and_epi32 (__m512i __A)
{
  __MM512_REDUCE_OP (&);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_or_epi32 (__m512i __A)
{
  __MM512_REDUCE_OP (|);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi32 (__U, __A);
  __MM512_REDUCE_OP (+);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A);
  __MM512_REDUCE_OP (*);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
  __MM512_REDUCE_OP (&);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi32 (__U, __A);
  __MM512_REDUCE_OP (|);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1);		\
  __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0);		\
  __m256i __T3 = _mm256_##op (__T1, __T2);				\
  __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1);		\
  __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0);		\
  __m128i __T6 = _mm_##op (__T4, __T5);					\
  __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6,		\
					      (__v4si) { 2, 3, 0, 1 });	\
  __m128i __T8 = _mm_##op (__T6, __T7);					\
  __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8,		\
					      (__v4si) { 1, 0, 1, 0 });	\
  __v4si __T10 = (__v4si) _mm_##op (__T8, __T9);			\
  return __T10[0]

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_min_epi32 (__m512i __A)
{
  __MM512_REDUCE_OP (min_epi32);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_epi32 (__m512i __A)
{
  __MM512_REDUCE_OP (max_epi32);
}

extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_min_epu32 (__m512i __A)
{
  __MM512_REDUCE_OP (min_epu32);
}

extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_epu32 (__m512i __A)
{
  __MM512_REDUCE_OP (max_epu32);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A);
  __MM512_REDUCE_OP (min_epi32);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A);
  __MM512_REDUCE_OP (max_epi32);
}

extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
  __MM512_REDUCE_OP (min_epu32);
}

extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi32 (__U, __A);
  __MM512_REDUCE_OP (max_epu32);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);	\
  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);	\
  __m256 __T3 = __T1 op __T2;						\
  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);			\
  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);			\
  __m128 __T6 = __T4 op __T5;						\
  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });	\
  __m128 __T8 = __T6 op __T7;						\
  return __T8[0] op __T8[1]

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_add_ps (__m512 __A)
{
  __MM512_REDUCE_OP (+);
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_mul_ps (__m512 __A)
{
  __MM512_REDUCE_OP (*);
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A)
{
  __A = _mm512_maskz_mov_ps (__U, __A);
  __MM512_REDUCE_OP (+);
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A)
{
  __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A);
  __MM512_REDUCE_OP (*);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);	\
  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);	\
  __m256 __T3 = _mm256_##op (__T1, __T2);				\
  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);			\
  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);			\
  __m128 __T6 = _mm_##op (__T4, __T5);					\
  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });	\
  __m128 __T8 = _mm_##op (__T6, __T7);					\
  __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 });	\
  __m128 __T10 = _mm_##op (__T8, __T9);					\
  return __T10[0]

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_min_ps (__m512 __A)
{
  __MM512_REDUCE_OP (min_ps);
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_ps (__m512 __A)
{
  __MM512_REDUCE_OP (max_ps);
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A)
{
  __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A);
  __MM512_REDUCE_OP (min_ps);
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A)
{
  __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A);
  __MM512_REDUCE_OP (max_ps);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1);		\
  __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0);		\
  __m256i __T3 = (__m256i) (__T1 op __T2);				\
  __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1);		\
  __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0);		\
  __v2di __T6 = __T4 op __T5;						\
  return __T6[0] op __T6[1]

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_add_epi64 (__m512i __A)
{
  __MM512_REDUCE_OP (+);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_mul_epi64 (__m512i __A)
{
  __MM512_REDUCE_OP (*);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_and_epi64 (__m512i __A)
{
  __MM512_REDUCE_OP (&);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_or_epi64 (__m512i __A)
{
  __MM512_REDUCE_OP (|);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi64 (__U, __A);
  __MM512_REDUCE_OP (+);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A);
  __MM512_REDUCE_OP (*);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
  __MM512_REDUCE_OP (&);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi64 (__U, __A);
  __MM512_REDUCE_OP (|);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e);			\
  __m512i __T2 = _mm512_##op (__A, __T1);				\
  __m512i __T3								\
    = (__m512i) __builtin_shuffle ((__v8di) __T2,			\
				   (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 });\
  __m512i __T4 = _mm512_##op (__T2, __T3);				\
  __m512i __T5								\
    = (__m512i) __builtin_shuffle ((__v8di) __T4,			\
				   (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });\
  __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5);			\
  return __T6[0]

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_min_epi64 (__m512i __A)
{
  __MM512_REDUCE_OP (min_epi64);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_epi64 (__m512i __A)
{
  __MM512_REDUCE_OP (max_epi64);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__),
			       __U, __A);
  __MM512_REDUCE_OP (min_epi64);
}

extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1),
			       __U, __A);
  __MM512_REDUCE_OP (max_epi64);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_min_epu64 (__m512i __A)
{
  __MM512_REDUCE_OP (min_epu64);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_epu64 (__m512i __A)
{
  __MM512_REDUCE_OP (max_epu64);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
  __MM512_REDUCE_OP (min_epu64);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi64 (__U, __A);
  __MM512_REDUCE_OP (max_epu64);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);		\
  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);		\
  __m256d __T3 = __T1 op __T2;						\
  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);			\
  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);			\
  __m128d __T6 = __T4 op __T5;						\
  return __T6[0] op __T6[1]

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_add_pd (__m512d __A)
{
  __MM512_REDUCE_OP (+);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_mul_pd (__m512d __A)
{
  __MM512_REDUCE_OP (*);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A)
{
  __A = _mm512_maskz_mov_pd (__U, __A);
  __MM512_REDUCE_OP (+);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A)
{
  __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A);
  __MM512_REDUCE_OP (*);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);		\
  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);		\
  __m256d __T3 = _mm256_##op (__T1, __T2);				\
  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);			\
  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);			\
  __m128d __T6 = _mm_##op (__T4, __T5);					\
  __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 });	\
  __m128d __T8 = _mm_##op (__T6, __T7);					\
  return __T8[0]

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_min_pd (__m512d __A)
{
  __MM512_REDUCE_OP (min_pd);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_pd (__m512d __A)
{
  __MM512_REDUCE_OP (max_pd);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A)
{
  __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A);
  __MM512_REDUCE_OP (min_pd);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A)
{
  __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A);
  __MM512_REDUCE_OP (max_pd);
}

#undef __MM512_REDUCE_OP

#ifdef __DISABLE_AVX512F__
#undef __DISABLE_AVX512F__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512F__ */

#endif /* _AVX512FINTRIN_H_INCLUDED */
/* Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 10.0.  */

#ifndef _NMMINTRIN_H_INCLUDED
#define _NMMINTRIN_H_INCLUDED

/* We just include SSE4.1 header file.  */
#include <smmintrin.h>

#endif /* _NMMINTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VBMIINTRIN_H_INCLUDED
#define _AVX512VBMIINTRIN_H_INCLUDED

#ifndef __AVX512VBMI__
#pragma GCC push_options
#pragma GCC target("avx512vbmi")
#define __DISABLE_AVX512VBMI__
#endif /* __AVX512VBMI__ */

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
							  (__v64qi) __Y,
							  (__v64qi) __W,
							  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
							  (__v64qi) __Y,
							  (__v64qi)
							  _mm512_setzero_si512 (),
							  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
							  (__v64qi) __Y,
							  (__v64qi)
							  _mm512_undefined_epi32 (),
							  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
						     (__v64qi) __A,
						     (__v64qi)
						     _mm512_undefined_epi32 (),
						     (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
				__m512i __B)
{
  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
						     (__v64qi) __A,
						     (__v64qi)
						     _mm512_setzero_si512(),
						     (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
			       __m512i __B)
{
  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
						     (__v64qi) __A,
						     (__v64qi) __W,
						     (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
							/* idx */ ,
							(__v64qi) __A,
							(__v64qi) __B,
							(__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
				__m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
							/* idx */ ,
							(__v64qi) __A,
							(__v64qi) __B,
							(__mmask64)
							__U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
				 __mmask64 __U, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
							(__v64qi) __I
							/* idx */ ,
							(__v64qi) __B,
							(__mmask64)
							__U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
				 __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
							 /* idx */ ,
							 (__v64qi) __A,
							 (__v64qi) __B,
							 (__mmask64)
							 __U);
}

#ifdef __DISABLE_AVX512VBMI__
#undef __DISABLE_AVX512VBMI__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMI__ */

#endif /* _AVX512VBMIINTRIN_H_INCLUDED */
/* Exception handling and frame unwind runtime interface routines.
   Copyright (C) 2001-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This is derived from the C++ ABI for IA-64.  Where we diverge
   for cross-architecture compatibility are noted with "@@@".  */

#ifndef _UNWIND_H
#define _UNWIND_H

#if defined (__SEH__) && !defined (__USING_SJLJ_EXCEPTIONS__)
/* Only for _GCC_specific_handler.  */
#include <windows.h>
#endif

#ifndef HIDE_EXPORTS
#pragma GCC visibility push(default)
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* Level 1: Base ABI  */

/* @@@ The IA-64 ABI uses uint64 throughout.  Most places this is
   inefficient for 32-bit and smaller machines.  */
typedef unsigned _Unwind_Word __attribute__((__mode__(__unwind_word__)));
typedef signed _Unwind_Sword __attribute__((__mode__(__unwind_word__)));
#if defined(__ia64__) && defined(__hpux__)
typedef unsigned _Unwind_Ptr __attribute__((__mode__(__word__)));
#else
typedef unsigned _Unwind_Ptr __attribute__((__mode__(__pointer__)));
#endif
typedef unsigned _Unwind_Internal_Ptr __attribute__((__mode__(__pointer__)));

/* @@@ The IA-64 ABI uses a 64-bit word to identify the producer and
   consumer of an exception.  We'll go along with this for now even on
   32-bit machines.  We'll need to provide some other option for
   16-bit machines and for machines with > 8 bits per byte.  */
typedef unsigned _Unwind_Exception_Class __attribute__((__mode__(__DI__)));

/* The unwind interface uses reason codes in several contexts to
   identify the reasons for failures or other actions.  */
typedef enum
{
  _URC_NO_REASON = 0,
  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
  _URC_FATAL_PHASE2_ERROR = 2,
  _URC_FATAL_PHASE1_ERROR = 3,
  _URC_NORMAL_STOP = 4,
  _URC_END_OF_STACK = 5,
  _URC_HANDLER_FOUND = 6,
  _URC_INSTALL_CONTEXT = 7,
  _URC_CONTINUE_UNWIND = 8
} _Unwind_Reason_Code;


/* The unwind interface uses a pointer to an exception header object
   as its representation of an exception being thrown. In general, the
   full representation of an exception object is language- and
   implementation-specific, but it will be prefixed by a header
   understood by the unwind interface.  */

struct _Unwind_Exception;

typedef void (*_Unwind_Exception_Cleanup_Fn) (_Unwind_Reason_Code,
					      struct _Unwind_Exception *);

struct _Unwind_Exception
{
  _Unwind_Exception_Class exception_class;
  _Unwind_Exception_Cleanup_Fn exception_cleanup;

#if !defined (__USING_SJLJ_EXCEPTIONS__) && defined (__SEH__)
  _Unwind_Word private_[6];
#else
  _Unwind_Word private_1;
  _Unwind_Word private_2;
#endif

  /* @@@ The IA-64 ABI says that this structure must be double-word aligned.
     Taking that literally does not make much sense generically.  Instead we
     provide the maximum alignment required by any type for the machine.  */
} __attribute__((__aligned__));


/* The ACTIONS argument to the personality routine is a bitwise OR of one
   or more of the following constants.  */
typedef int _Unwind_Action;

#define _UA_SEARCH_PHASE	1
#define _UA_CLEANUP_PHASE	2
#define _UA_HANDLER_FRAME	4
#define _UA_FORCE_UNWIND	8
#define _UA_END_OF_STACK	16

/* The target can override this macro to define any back-end-specific
   attributes required for the lowest-level stack frame.  */
#ifndef LIBGCC2_UNWIND_ATTRIBUTE
#define LIBGCC2_UNWIND_ATTRIBUTE
#endif

/* This is an opaque type used to refer to a system-specific data
   structure used by the system unwinder. This context is created and
   destroyed by the system, and passed to the personality routine
   during unwinding.  */
struct _Unwind_Context;

/* Raise an exception, passing along the given exception object.  */
extern _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_RaiseException (struct _Unwind_Exception *);

/* Raise an exception for forced unwinding.  */

typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)
     (int, _Unwind_Action, _Unwind_Exception_Class,
      struct _Unwind_Exception *, struct _Unwind_Context *, void *);

extern _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_ForcedUnwind (struct _Unwind_Exception *, _Unwind_Stop_Fn, void *);

/* Helper to invoke the exception_cleanup routine.  */
extern void _Unwind_DeleteException (struct _Unwind_Exception *);

/* Resume propagation of an existing exception.  This is used after
   e.g. executing cleanup code, and not to implement rethrowing.  */
extern void LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_Resume (struct _Unwind_Exception *);

/* @@@ Resume propagation of a FORCE_UNWIND exception, or to rethrow
   a normal exception that was handled.  */
extern _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_Resume_or_Rethrow (struct _Unwind_Exception *);

/* @@@ Use unwind data to perform a stack backtrace.  The trace callback
   is called for every stack frame in the call chain, but no cleanup
   actions are performed.  */
typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)
     (struct _Unwind_Context *, void *);

extern _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_Backtrace (_Unwind_Trace_Fn, void *);

/* These functions are used for communicating information about the unwind
   context (i.e. the unwind descriptors and the user register state) between
   the unwind library and the personality routine and landing pad.  Only
   selected registers may be manipulated.  */

extern _Unwind_Word _Unwind_GetGR (struct _Unwind_Context *, int);
extern void _Unwind_SetGR (struct _Unwind_Context *, int, _Unwind_Word);

extern _Unwind_Ptr _Unwind_GetIP (struct _Unwind_Context *);
extern _Unwind_Ptr _Unwind_GetIPInfo (struct _Unwind_Context *, int *);
extern void _Unwind_SetIP (struct _Unwind_Context *, _Unwind_Ptr);

/* @@@ Retrieve the CFA of the given context.  */
extern _Unwind_Word _Unwind_GetCFA (struct _Unwind_Context *);

extern void *_Unwind_GetLanguageSpecificData (struct _Unwind_Context *);

extern _Unwind_Ptr _Unwind_GetRegionStart (struct _Unwind_Context *);


/* The personality routine is the function in the C++ (or other language)
   runtime library which serves as an interface between the system unwind
   library and language-specific exception handling semantics.  It is
   specific to the code fragment described by an unwind info block, and
   it is always referenced via the pointer in the unwind info block, and
   hence it has no ABI-specified name.

   Note that this implies that two different C++ implementations can
   use different names, and have different contents in the language
   specific data area.  Moreover, that the language specific data
   area contains no version info because name of the function invoked
   provides more effective versioning by detecting at link time the
   lack of code to handle the different data format.  */

typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)
     (int, _Unwind_Action, _Unwind_Exception_Class,
      struct _Unwind_Exception *, struct _Unwind_Context *);

/* @@@ The following alternate entry points are for setjmp/longjmp
   based unwinding.  */

struct SjLj_Function_Context;
extern void _Unwind_SjLj_Register (struct SjLj_Function_Context *);
extern void _Unwind_SjLj_Unregister (struct SjLj_Function_Context *);

extern _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_SjLj_RaiseException (struct _Unwind_Exception *);
extern _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_SjLj_ForcedUnwind (struct _Unwind_Exception *, _Unwind_Stop_Fn, void *);
extern void LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_SjLj_Resume (struct _Unwind_Exception *);
extern _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
_Unwind_SjLj_Resume_or_Rethrow (struct _Unwind_Exception *);

/* @@@ The following provide access to the base addresses for text
   and data-relative addressing in the LDSA.  In order to stay link
   compatible with the standard ABI for IA-64, we inline these.  */

#ifdef __ia64__
static inline _Unwind_Ptr
_Unwind_GetDataRelBase (struct _Unwind_Context *_C)
{
  /* The GP is stored in R1.  */
  return _Unwind_GetGR (_C, 1);
}

static inline _Unwind_Ptr
_Unwind_GetTextRelBase (struct _Unwind_Context *_C __attribute__ ((__unused__)))
{
  __builtin_abort ();
  return 0;
}

/* @@@ Retrieve the Backing Store Pointer of the given context.  */
extern _Unwind_Word _Unwind_GetBSP (struct _Unwind_Context *);
#else
extern _Unwind_Ptr _Unwind_GetDataRelBase (struct _Unwind_Context *);
extern _Unwind_Ptr _Unwind_GetTextRelBase (struct _Unwind_Context *);
#endif

/* @@@ Given an address, return the entry point of the function that
   contains it.  */
extern void * _Unwind_FindEnclosingFunction (void *pc);

#ifndef __SIZEOF_LONG__
  #error "__SIZEOF_LONG__ macro not defined"
#endif

#ifndef __SIZEOF_POINTER__
  #error "__SIZEOF_POINTER__ macro not defined"
#endif


/* leb128 type numbers have a potentially unlimited size.
   The target of the following definitions of _sleb128_t and _uleb128_t
   is to have efficient data types large enough to hold the leb128 type
   numbers used in the unwind code.
   Mostly these types will simply be defined to long and unsigned long
   except when a unsigned long data type on the target machine is not
   capable of storing a pointer.  */

#if __SIZEOF_LONG__ >= __SIZEOF_POINTER__
  typedef long _sleb128_t;
  typedef unsigned long _uleb128_t;
#elif __SIZEOF_LONG_LONG__ >= __SIZEOF_POINTER__
  typedef long long _sleb128_t;
  typedef unsigned long long _uleb128_t;
#else
# error "What type shall we use for _sleb128_t?"
#endif

#if defined (__SEH__) && !defined (__USING_SJLJ_EXCEPTIONS__)
/* Handles the mapping from SEH to GCC interfaces.  */
EXCEPTION_DISPOSITION _GCC_specific_handler (PEXCEPTION_RECORD, void *,
					     PCONTEXT, PDISPATCHER_CONTEXT,
					     _Unwind_Personality_Fn);
#endif

#ifdef __cplusplus
}
#endif

#ifndef HIDE_EXPORTS
#pragma GCC visibility pop
#endif

/* Additional actions to unwind number of stack frames.  */
#define _Unwind_Frames_Extra(frames)

/* Increment frame count.  */
#define _Unwind_Frames_Increment(context, frames) frames++

#endif /* unwind.h */
/* Copyright (C) 2018-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _X86INTRIN_H_INCLUDED
#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _PCONFIGINTRIN_H_INCLUDED
#define _PCONFIGINTRIN_H_INCLUDED

#ifndef __PCONFIG__
#pragma GCC push_options
#pragma GCC target("pconfig")
#define __DISABLE_PCONFIG__
#endif /* __PCONFIG__ */

#define __pconfig_b(leaf, b, retval)			\
  __asm__ __volatile__ ("pconfig\n\t"			\
	: "=a" (retval)					\
	: "a" (leaf), "b" (b)				\
	: "cc")

#define __pconfig_generic(leaf, b, c, d, retval)	\
  __asm__ __volatile__ ("pconfig\n\t"			\
	: "=a" (retval), "=b" (b), "=c" (c), "=d" (d)	\
	: "a" (leaf), "b" (b), "c" (c), "d" (d)		\
	: "cc")

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pconfig_u32 (const unsigned int __L, size_t __D[])
{
  enum __pconfig_type
  {
    __PCONFIG_KEY_PROGRAM = 0x01,
  };

  unsigned int __R = 0;

  if (!__builtin_constant_p (__L))
    __pconfig_generic (__L, __D[0], __D[1], __D[2], __R);
  else switch (__L)
    {
    case __PCONFIG_KEY_PROGRAM:
      __pconfig_b (__L, __D[0], __R);
      break;
    default:
      __pconfig_generic (__L, __D[0], __D[1], __D[2], __R);
    }
  return __R;
}

#ifdef __DISABLE_PCONFIG__
#undef __DISABLE_PCONFIG__
#pragma GCC pop_options
#endif /* __DISABLE_PCONFIG__ */

#endif /* _PCONFIGINTRIN_H_INCLUDED */
/* Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _IMMINTRIN_H_INCLUDED
# error "Never use <avx512vpopcntdqintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED
#define _AVX512VPOPCNTDQINTRIN_H_INCLUDED

#ifndef __AVX512VPOPCNTDQ__
#pragma GCC push_options
#pragma GCC target("avx512vpopcntdq")
#define __DISABLE_AVX512VPOPCNTDQ__
#endif /* __AVX512VPOPCNTDQ__ */

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi32 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountd_v16si ((__v16si) __A);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A,
							 (__v16si) __W,
							 (__mmask16) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A,
							 (__v16si)
							 _mm512_setzero_si512 (),
							 (__mmask16) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi64 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountq_v8di ((__v8di) __A);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A,
							(__v8di) __W,
							(__mmask8) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A,
							(__v8di)
							_mm512_setzero_si512 (),
							(__mmask8) __U);
}

#ifdef __DISABLE_AVX512VPOPCNTDQ__
#undef __DISABLE_AVX512VPOPCNTDQ__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VPOPCNTDQ__ */

#endif /* _AVX512VPOPCNTDQINTRIN_H_INCLUDED */
/* Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _SGXINTRIN_H_INCLUDED
#define _SGXINTRIN_H_INCLUDED

#ifndef __SGX__
#pragma GCC push_options
#pragma GCC target("sgx")
#define __DISABLE_SGX__
#endif /* __SGX__ */

#define __encls_bc(leaf, b, c, retval)	    	 	\
  __asm__ __volatile__ ("encls\n\t"		     	\
	   : "=a" (retval)			     	\
	   : "a" (leaf), "b" (b), "c" (c)		\
	   : "cc")

#define __encls_bcd(leaf, b, c, d, retval)	     	\
  __asm__ __volatile__("encls\n\t"		     	\
	   : "=a" (retval)			     	\
	   : "a" (leaf), "b" (b), "c" (c), "d" (d)	\
	   : "cc")

#define __encls_c(leaf, c, retval)		     	\
  __asm__ __volatile__("encls\n\t"		     	\
	   : "=a" (retval)			     	\
	   : "a" (leaf), "c" (c)			\
	   : "cc")

#define __encls_edbgrd(leaf, b, c, retval)	     	\
  __asm__ __volatile__("encls\n\t"		     	\
	   : "=a" (retval), "=b" (b)		     	\
	   : "a" (leaf), "c" (c))

#define __encls_generic(leaf, b, c, d, retval)   	\
  __asm__ __volatile__("encls\n\t"		     	\
	   : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\
	   : "a" (leaf), "b" (b), "c" (c), "d" (d)	\
	   : "cc")

#define __enclu_bc(leaf, b, c, retval)			\
  __asm__ __volatile__("enclu\n\t"			\
	   : "=a" (retval)				\
	   : "a" (leaf), "b" (b), "c" (c)		\
	   : "cc")

#define __enclu_bcd(leaf, b, c, d, retval)		\
  __asm__ __volatile__("enclu\n\t"			\
	   : "=a" (retval)				\
	   : "a" (leaf), "b" (b), "c" (c), "d" (d)	\
	   : "cc")

#define __enclu_eenter(leaf, b, c, retval)		\
  __asm__  __volatile__("enclu\n\t"			\
	   : "=a" (retval), "=c" (c)			\
	   : "a" (leaf), "b" (b), "c" (c)		\
	   : "cc")

#define __enclu_eexit(leaf, b, c, retval)		\
  __asm__  __volatile__("enclu\n\t"			\
	   : "=a" (retval), "=c" (c)			\
	   : "a" (leaf), "b" (b)			\
	   : "cc")

#define __enclu_generic(leaf, b, c, d, retval)		\
  __asm__ __volatile__("enclu\n\t"			\
	   : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\
	   : "a" (leaf), "b" (b), "c" (c), "d" (d)	\
	   : "cc")

#define __enclv_bc(leaf, b, c, retval)			\
  __asm__ __volatile__("enclv\n\t"			\
	   : "=a" (retval)				\
	   : "a" (leaf), "b" (b), "c" (c)		\
	   : "cc")

#define __enclv_cd(leaf, c, d, retval)			\
  __asm__ __volatile__("enclv\n\t"			\
	   : "=a" (retval)				\
	   : "a" (leaf), "c" (c), "d" (d)		\
	   : "cc")

#define __enclv_generic(leaf, b, c, d, retval)		\
  __asm__ __volatile__("enclv\n\t"			\
	   : "=a" (retval), "=b" (b), "=c" (b), "=d" (d)\
	   : "a" (leaf), "b" (b), "c" (c), "d" (d)	\
	   : "cc")

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_encls_u32 (const unsigned int __L, size_t __D[])
{
  enum __encls_type
  {
    __SGX_ECREATE = 0x00,
    __SGX_EADD    = 0x01,
    __SGX_EINIT   = 0x02,
    __SGX_EREMOVE = 0x03,
    __SGX_EDBGRD  = 0x04,
    __SGX_EDBGWR  = 0x05,
    __SGX_EEXTEND = 0x06,
    __SGX_ELDB    = 0x07,
    __SGX_ELDU    = 0x08,
    __SGX_EBLOCK  = 0x09,
    __SGX_EPA     = 0x0A,
    __SGX_EWB     = 0x0B,
    __SGX_ETRACK  = 0x0C,
    __SGX_EAUG    = 0x0D,
    __SGX_EMODPR  = 0x0E,
    __SGX_EMODT   = 0x0F,
    __SGX_ERDINFO = 0x10,
    __SGX_ETRACKC = 0x11,
    __SGX_ELDBC   = 0x12,
    __SGX_ELDUC   = 0x13
  };
  enum __encls_type __T = (enum __encls_type)__L;
  unsigned int __R = 0;
  if (!__builtin_constant_p (__T))
    __encls_generic (__L, __D[0], __D[1], __D[2], __R);
  else switch (__T)
    {
    case __SGX_ECREATE:
    case __SGX_EADD:
    case __SGX_EDBGWR:
    case __SGX_EEXTEND:
    case __SGX_EPA:
    case __SGX_EMODPR:
    case __SGX_EMODT:
    case __SGX_EAUG:
    case __SGX_ERDINFO:
      __encls_bc (__L, __D[0], __D[1], __R);
      break;
    case __SGX_EINIT:
    case __SGX_ELDB:
    case __SGX_ELDU:
    case __SGX_EWB:
    case __SGX_ELDBC:
    case __SGX_ELDUC:
      __encls_bcd (__L, __D[0], __D[1], __D[2], __R);
      break;
    case __SGX_EREMOVE:
    case __SGX_EBLOCK:
    case __SGX_ETRACK:
    case __SGX_ETRACKC:
      __encls_c (__L, __D[1], __R);
      break;
    case __SGX_EDBGRD:
      __encls_edbgrd (__L, __D[0], __D[1], __R);
      break;
    default:
      __encls_generic (__L, __D[0], __D[1], __D[2], __R);
    }
  return __R;
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_enclu_u32 (const unsigned int __L, size_t __D[])
{
  enum __enclu_type
  {
    __SGX_EREPORT     = 0x00,
    __SGX_EGETKEY     = 0x01,
    __SGX_EENTER      = 0x02,
    __SGX_ERESUME     = 0x03,
    __SGX_EEXIT       = 0x04,
    __SGX_EACCEPT     = 0x05,
    __SGX_EMODPE      = 0x06,
    __SGX_EACCEPTCOPY = 0x07
  };
  enum __enclu_type __T = (enum __enclu_type) __L;
  unsigned int __R = 0;
  if (!__builtin_constant_p (__T))
    __enclu_generic (__L, __D[0], __D[1], __D[2], __R);
  else switch (__T)
    {
    case __SGX_EREPORT:
    case __SGX_EACCEPTCOPY:
      __enclu_bcd (__L, __D[0], __D[1], __D[2], __R);
      break;
    case __SGX_EGETKEY:
    case __SGX_ERESUME:
    case __SGX_EACCEPT:
    case __SGX_EMODPE:
      __enclu_bc (__L, __D[0], __D[1], __R);
      break;
    case __SGX_EENTER:
      __enclu_eenter (__L, __D[0], __D[1], __R);
      break;
    case __SGX_EEXIT:
      __enclu_eexit (__L, __D[0], __D[1], __R);
      break;
    default:
      __enclu_generic (__L, __D[0], __D[1], __D[2], __R);
    }
  return __R;
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_enclv_u32 (const unsigned int __L, size_t __D[])
{
  enum __enclv_type
  {
    __SGX_EDECVIRTCHILD = 0x00,
    __SGX_EINCVIRTCHILD = 0x01,
    __SGX_ESETCONTEXT   = 0x02
  };
  unsigned int __R = 0;
  if (!__builtin_constant_p (__L))
    __enclv_generic (__L, __D[0], __D[1], __D[2], __R);
  else switch (__L)
    {
    case __SGX_EDECVIRTCHILD:
    case __SGX_EINCVIRTCHILD:
      __enclv_bc (__L, __D[0], __D[1], __R);
      break;
    case __SGX_ESETCONTEXT:
      __enclv_cd (__L, __D[1], __D[2], __R);
      break;
    default:
      __enclv_generic (__L, __D[0], __D[1], __D[2], __R);
    }
  return __R;
}

#ifdef __DISABLE_SGX__
#undef __DISABLE_SGX__
#pragma GCC pop_options
#endif /* __DISABLE_SGX__ */

#endif /* _SGXINTRIN_H_INCLUDED */
/* Copyright (C) 1997-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*
 * ISO C Standard:  7.9  Alternative spellings  <iso646.h>
 */

#ifndef _ISO646_H
#define _ISO646_H

#ifndef __cplusplus
#define and	&&
#define and_eq	&=
#define bitand	&
#define bitor	|
#define compl	~
#define not	!
#define not_eq	!=
#define or	||
#define or_eq	|=
#define xor	^
#define xor_eq	^=
#endif

#endif
/* Copyright (C) 2011-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* ISO C1X: 7.23 _Noreturn <stdnoreturn.h>.  */

#ifndef _STDNORETURN_H
#define _STDNORETURN_H

#ifndef __cplusplus

#define noreturn _Noreturn

#endif

#endif	/* stdnoreturn.h */
/*
 * Copyright (C) 2007-2018 Free Software Foundation, Inc.
 *
 * This file is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 3, or (at your option) any
 * later version.
 * 
 * This file is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * Under Section 7 of GPL version 3, you are granted additional
 * permissions described in the GCC Runtime Library Exception, version
 * 3.1, as published by the Free Software Foundation.
 * 
 * You should have received a copy of the GNU General Public License and
 * a copy of the GCC Runtime Library Exception along with this program;
 * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

/* %ecx */
#define bit_SSE3	(1 << 0)
#define bit_PCLMUL	(1 << 1)
#define bit_LZCNT	(1 << 5)
#define bit_SSSE3	(1 << 9)
#define bit_FMA		(1 << 12)
#define bit_CMPXCHG16B	(1 << 13)
#define bit_SSE4_1	(1 << 19)
#define bit_SSE4_2	(1 << 20)
#define bit_MOVBE	(1 << 22)
#define bit_POPCNT	(1 << 23)
#define bit_AES		(1 << 25)
#define bit_XSAVE	(1 << 26)
#define bit_OSXSAVE	(1 << 27)
#define bit_AVX		(1 << 28)
#define bit_F16C	(1 << 29)
#define bit_RDRND	(1 << 30)

/* %edx */
#define bit_CMPXCHG8B	(1 << 8)
#define bit_CMOV	(1 << 15)
#define bit_MMX		(1 << 23)
#define bit_FXSAVE	(1 << 24)
#define bit_SSE		(1 << 25)
#define bit_SSE2	(1 << 26)

/* Extended Features (%eax == 0x80000001) */
/* %ecx */
#define bit_LAHF_LM	(1 << 0)
#define bit_ABM		(1 << 5)
#define bit_SSE4a	(1 << 6)
#define bit_PRFCHW	(1 << 8)
#define bit_XOP         (1 << 11)
#define bit_LWP 	(1 << 15)
#define bit_FMA4        (1 << 16)
#define bit_TBM         (1 << 21)
#define bit_MWAITX      (1 << 29)

/* %edx */
#define bit_MMXEXT	(1 << 22)
#define bit_LM		(1 << 29)
#define bit_3DNOWP	(1 << 30)
#define bit_3DNOW	(1u << 31)

/* %ebx  */
#define bit_CLZERO	(1 << 0)
#define bit_WBNOINVD	(1 << 9)

/* Extended Features (%eax == 7) */
/* %ebx */
#define bit_FSGSBASE	(1 << 0)
#define bit_SGX (1 << 2)
#define bit_BMI	(1 << 3)
#define bit_HLE	(1 << 4)
#define bit_AVX2	(1 << 5)
#define bit_BMI2	(1 << 8)
#define bit_RTM	(1 << 11)
#define bit_MPX	(1 << 14)
#define bit_AVX512F	(1 << 16)
#define bit_AVX512DQ	(1 << 17)
#define bit_RDSEED	(1 << 18)
#define bit_ADX	(1 << 19)
#define bit_AVX512IFMA	(1 << 21)
#define bit_CLFLUSHOPT	(1 << 23)
#define bit_CLWB	(1 << 24)
#define bit_AVX512PF	(1 << 26)
#define bit_AVX512ER	(1 << 27)
#define bit_AVX512CD	(1 << 28)
#define bit_SHA		(1 << 29)
#define bit_AVX512BW	(1 << 30)
#define bit_AVX512VL	(1u << 31)

/* %ecx */
#define bit_PREFETCHWT1	  (1 << 0)
#define bit_AVX512VBMI	(1 << 1)
#define bit_PKU	(1 << 3)
#define bit_OSPKE	(1 << 4)
#define bit_AVX512VBMI2	(1 << 6)
#define bit_SHSTK	(1 << 7)
#define bit_GFNI	(1 << 8)
#define bit_VAES	(1 << 9)
#define bit_AVX512VNNI	(1 << 11)
#define bit_VPCLMULQDQ	(1 << 10)
#define bit_AVX512BITALG	(1 << 12)
#define bit_AVX512VPOPCNTDQ	(1 << 14)
#define bit_RDPID	(1 << 22)
#define bit_MOVDIRI	(1 << 27)
#define bit_MOVDIR64B	(1 << 28)

/* %edx */
#define bit_AVX5124VNNIW (1 << 2)
#define bit_AVX5124FMAPS (1 << 3)
#define bit_IBT	(1 << 20)
#define bit_PCONFIG	(1 << 18)
/* XFEATURE_ENABLED_MASK register bits (%eax == 13, %ecx == 0) */
#define bit_BNDREGS     (1 << 3)
#define bit_BNDCSR      (1 << 4)

/* Extended State Enumeration Sub-leaf (%eax == 13, %ecx == 1) */
#define bit_XSAVEOPT	(1 << 0)
#define bit_XSAVEC	(1 << 1)
#define bit_XSAVES	(1 << 3)

/* Signatures for different CPU implementations as returned in uses
   of cpuid with level 0.  */
#define signature_AMD_ebx	0x68747541
#define signature_AMD_ecx	0x444d4163
#define signature_AMD_edx	0x69746e65

#define signature_CENTAUR_ebx	0x746e6543
#define signature_CENTAUR_ecx	0x736c7561
#define signature_CENTAUR_edx	0x48727561

#define signature_CYRIX_ebx	0x69727943
#define signature_CYRIX_ecx	0x64616574
#define signature_CYRIX_edx	0x736e4978

#define signature_INTEL_ebx	0x756e6547
#define signature_INTEL_ecx	0x6c65746e
#define signature_INTEL_edx	0x49656e69

#define signature_TM1_ebx	0x6e617254
#define signature_TM1_ecx	0x55504361
#define signature_TM1_edx	0x74656d73

#define signature_TM2_ebx	0x756e6547
#define signature_TM2_ecx	0x3638784d
#define signature_TM2_edx	0x54656e69

#define signature_NSC_ebx	0x646f6547
#define signature_NSC_ecx	0x43534e20
#define signature_NSC_edx	0x79622065

#define signature_NEXGEN_ebx	0x4778654e
#define signature_NEXGEN_ecx	0x6e657669
#define signature_NEXGEN_edx	0x72446e65

#define signature_RISE_ebx	0x65736952
#define signature_RISE_ecx	0x65736952
#define signature_RISE_edx	0x65736952

#define signature_SIS_ebx	0x20536953
#define signature_SIS_ecx	0x20536953
#define signature_SIS_edx	0x20536953

#define signature_UMC_ebx	0x20434d55
#define signature_UMC_ecx	0x20434d55
#define signature_UMC_edx	0x20434d55

#define signature_VIA_ebx	0x20414956
#define signature_VIA_ecx	0x20414956
#define signature_VIA_edx	0x20414956

#define signature_VORTEX_ebx	0x74726f56
#define signature_VORTEX_ecx	0x436f5320
#define signature_VORTEX_edx	0x36387865

#ifndef __x86_64__
/* At least one cpu (Winchip 2) does not set %ebx and %ecx
   for cpuid leaf 1. Forcibly zero the two registers before
   calling cpuid as a precaution.  */
#define __cpuid(level, a, b, c, d)					\
  do {									\
    if (__builtin_constant_p (level) && (level) != 1)			\
      __asm__ __volatile__ ("cpuid\n\t"					\
			    : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
			    : "0" (level));				\
    else								\
      __asm__ __volatile__ ("cpuid\n\t"					\
			    : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
			    : "0" (level), "1" (0), "2" (0));		\
  } while (0)
#else
#define __cpuid(level, a, b, c, d)					\
  __asm__ __volatile__ ("cpuid\n\t"					\
			: "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
			: "0" (level))
#endif

#define __cpuid_count(level, count, a, b, c, d)				\
  __asm__ __volatile__ ("cpuid\n\t"					\
			: "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
			: "0" (level), "2" (count))


/* Return highest supported input value for cpuid instruction.  ext can
   be either 0x0 or 0x80000000 to return highest supported value for
   basic or extended cpuid information.  Function returns 0 if cpuid
   is not supported or whatever cpuid returns in eax register.  If sig
   pointer is non-null, then first four bytes of the signature
   (as found in ebx register) are returned in location pointed by sig.  */

static __inline unsigned int
__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
{
  unsigned int __eax, __ebx, __ecx, __edx;

#ifndef __x86_64__
  /* See if we can use cpuid.  On AMD64 we always can.  */
#if __GNUC__ >= 3
  __asm__ ("pushf{l|d}\n\t"
	   "pushf{l|d}\n\t"
	   "pop{l}\t%0\n\t"
	   "mov{l}\t{%0, %1|%1, %0}\n\t"
	   "xor{l}\t{%2, %0|%0, %2}\n\t"
	   "push{l}\t%0\n\t"
	   "popf{l|d}\n\t"
	   "pushf{l|d}\n\t"
	   "pop{l}\t%0\n\t"
	   "popf{l|d}\n\t"
	   : "=&r" (__eax), "=&r" (__ebx)
	   : "i" (0x00200000));
#else
/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
   nor alternatives in i386 code.  */
  __asm__ ("pushfl\n\t"
	   "pushfl\n\t"
	   "popl\t%0\n\t"
	   "movl\t%0, %1\n\t"
	   "xorl\t%2, %0\n\t"
	   "pushl\t%0\n\t"
	   "popfl\n\t"
	   "pushfl\n\t"
	   "popl\t%0\n\t"
	   "popfl\n\t"
	   : "=&r" (__eax), "=&r" (__ebx)
	   : "i" (0x00200000));
#endif

  if (!((__eax ^ __ebx) & 0x00200000))
    return 0;
#endif

  /* Host supports cpuid.  Return highest supported cpuid input value.  */
  __cpuid (__ext, __eax, __ebx, __ecx, __edx);

  if (__sig)
    *__sig = __ebx;

  return __eax;
}

/* Return cpuid data for requested cpuid leaf, as found in returned
   eax, ebx, ecx and edx registers.  The function checks if cpuid is
   supported and returns 1 for valid cpuid information or 0 for
   unsupported cpuid leaf.  All pointers are required to be non-null.  */

static __inline int
__get_cpuid (unsigned int __leaf,
	     unsigned int *__eax, unsigned int *__ebx,
	     unsigned int *__ecx, unsigned int *__edx)
{
  unsigned int __ext = __leaf & 0x80000000;
  unsigned int __maxlevel = __get_cpuid_max (__ext, 0);

  if (__maxlevel == 0 || __maxlevel < __leaf)
    return 0;

  __cpuid (__leaf, *__eax, *__ebx, *__ecx, *__edx);
  return 1;
}

/* Same as above, but sub-leaf can be specified.  */

static __inline int
__get_cpuid_count (unsigned int __leaf, unsigned int __subleaf,
		   unsigned int *__eax, unsigned int *__ebx,
		   unsigned int *__ecx, unsigned int *__edx)
{
  unsigned int __ext = __leaf & 0x80000000;
  unsigned int __maxlevel = __get_cpuid_max (__ext, 0);

  if (__maxlevel == 0 || __maxlevel < __leaf)
    return 0;

  __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
  return 1;
}
/* Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the AMD Programmers
   Manual Update, version 2.x */

#ifndef _AMMINTRIN_H_INCLUDED
#define _AMMINTRIN_H_INCLUDED

/* We need definitions from the SSE3, SSE2 and SSE header files*/
#include <pmmintrin.h>

#ifndef __SSE4A__
#pragma GCC push_options
#pragma GCC target("sse4a")
#define __DISABLE_SSE4A__
#endif /* __SSE4A__ */

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_sd (double * __P, __m128d __Y)
{
  __builtin_ia32_movntsd (__P, (__v2df) __Y);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ss (float * __P, __m128 __Y)
{
  __builtin_ia32_movntss (__P, (__v4sf) __Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_si64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y);
}

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L)
{
  return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L);
}
#else
#define _mm_extracti_si64(X, I, L)					\
  ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X),		\
				    (unsigned int)(I), (unsigned int)(L)))
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_si64 (__m128i __X,__m128i __Y)
{
  return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L)
{
  return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L);
}
#else
#define _mm_inserti_si64(X, Y, I, L)					\
  ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X),		\
				      (__v2di)(__m128i)(Y),		\
				      (unsigned int)(I), (unsigned int)(L)))
#endif

#ifdef __DISABLE_SSE4A__
#undef __DISABLE_SSE4A__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4A__ */

#endif /* _AMMINTRIN_H_INCLUDED */
/* Copyright (C) 2014-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <xsavecintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _XSAVECINTRIN_H_INCLUDED
#define _XSAVECINTRIN_H_INCLUDED

#ifndef __XSAVEC__
#pragma GCC push_options
#pragma GCC target("xsavec")
#define __DISABLE_XSAVEC__
#endif /* __XSAVEC__ */

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xsavec (void *__P, long long __M)
{
  __builtin_ia32_xsavec (__P, __M);
}

#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xsavec64 (void *__P, long long __M)
{
  __builtin_ia32_xsavec64 (__P, __M);
}
#endif

#ifdef __DISABLE_XSAVEC__
#undef __DISABLE_XSAVEC__
#pragma GCC pop_options
#endif /* __DISABLE_XSAVEC__ */

#endif /* _XSAVECINTRIN_H_INCLUDED */
/* Copyright (C) 2015-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _IMMINTRIN_H_INCLUDED
# error "Never use <avx5124fmapsintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _AVX5124FMAPSINTRIN_H_INCLUDED
#define _AVX5124FMAPSINTRIN_H_INCLUDED

#ifndef __AVX5124FMAPS__
#pragma GCC push_options
#pragma GCC target("avx5124fmaps")
#define __DISABLE_AVX5124FMAPS__
#endif /* __AVX5124FMAPS__ */

extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4fmadd_ps (__m512 __A, __m512 __B, __m512 __C,
		  __m512 __D, __m512 __E, __m128 *__F)
{
  return (__m512) __builtin_ia32_4fmaddps ((__v16sf) __B,
					   (__v16sf) __C,
					   (__v16sf) __D,
					   (__v16sf) __E,
					   (__v16sf) __A,
					   (const __v4sf *) __F);
}

extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
		       __m512 __C, __m512 __D, __m512 __E, __m128 *__F)
{
  return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
						(__v16sf) __C,
						(__v16sf) __D,
						(__v16sf) __E,
						(__v16sf) __A,
						(const __v4sf *) __F,
						(__v16sf) __A,
						(__mmask16) __U);
}

extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4fmadd_ps (__mmask16 __U,
			__m512 __A, __m512 __B, __m512 __C,
			__m512 __D, __m512 __E, __m128 *__F)
{
  return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
						(__v16sf) __C,
						(__v16sf) __D,
						(__v16sf) __E,
						(__v16sf) __A,
						(const __v4sf *) __F,
						(__v16sf) _mm512_setzero_ps (),
						(__mmask16) __U);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_4fmadd_ss (__m128 __A, __m128 __B, __m128 __C,
	       __m128 __D, __m128 __E, __m128 *__F)
{
  return (__m128) __builtin_ia32_4fmaddss ((__v4sf) __B,
					   (__v4sf) __C,
					   (__v4sf) __D,
					   (__v4sf) __E,
					   (__v4sf) __A,
					   (const __v4sf *) __F);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_4fmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
		    __m128 __D, __m128 __E, __m128 *__F)
{
  return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
						(__v4sf) __C,
						(__v4sf) __D,
						(__v4sf) __E,
						(__v4sf) __A,
						(const __v4sf *) __F,
						(__v4sf) __A,
						(__mmask8) __U);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_4fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
		     __m128 __D, __m128 __E, __m128 *__F)
{
  return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
						(__v4sf) __C,
						(__v4sf) __D,
						(__v4sf) __E,
						(__v4sf) __A,
						(const __v4sf *) __F,
						(__v4sf) _mm_setzero_ps (),
						(__mmask8) __U);
}

extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4fnmadd_ps (__m512 __A, __m512 __B, __m512 __C,
		   __m512 __D, __m512 __E, __m128 *__F)
{
  return (__m512) __builtin_ia32_4fnmaddps ((__v16sf) __B,
					    (__v16sf) __C,
					    (__v16sf) __D,
					    (__v16sf) __E,
					    (__v16sf) __A,
					    (const __v4sf *) __F);
}

extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
			__m512 __C, __m512 __D, __m512 __E, __m128 *__F)
{
  return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
						 (__v16sf) __C,
						 (__v16sf) __D,
						 (__v16sf) __E,
						 (__v16sf) __A,
						 (const __v4sf *) __F,
						 (__v16sf) __A,
						 (__mmask16) __U);
}

extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4fnmadd_ps (__mmask16 __U,
			 __m512 __A, __m512 __B, __m512 __C,
			 __m512 __D, __m512 __E, __m128 *__F)
{
  return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
						 (__v16sf) __C,
						 (__v16sf) __D,
						 (__v16sf) __E,
						 (__v16sf) __A,
						 (const __v4sf *) __F,
						 (__v16sf) _mm512_setzero_ps (),
						 (__mmask16) __U);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_4fnmadd_ss (__m128 __A, __m128 __B, __m128 __C,
		__m128 __D, __m128 __E, __m128 *__F)
{
  return (__m128) __builtin_ia32_4fnmaddss ((__v4sf) __B,
					    (__v4sf) __C,
					    (__v4sf) __D,
					    (__v4sf) __E,
					    (__v4sf) __A,
					    (const __v4sf *) __F);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_4fnmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
		     __m128 __D, __m128 __E, __m128 *__F)
{
  return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
						 (__v4sf) __C,
						 (__v4sf) __D,
						 (__v4sf) __E,
						 (__v4sf) __A,
						 (const __v4sf *) __F,
						 (__v4sf) __A,
						 (__mmask8) __U);
}

extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_4fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
		      __m128 __D, __m128 __E, __m128 *__F)
{
  return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
						 (__v4sf) __C,
						 (__v4sf) __D,
						 (__v4sf) __E,
						 (__v4sf) __A,
						 (const __v4sf *) __F,
						 (__v4sf) _mm_setzero_ps (),
						 (__mmask8) __U);
}

#ifdef __DISABLE_AVX5124FMAPS__
#undef __DISABLE_AVX5124FMAPS__
#pragma GCC pop_options
#endif /* __DISABLE_AVX5124FMAPS__ */

#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */
/* Copyright (C) 1989-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*
 * ISO C Standard:  7.17  Common definitions  <stddef.h>
 */
#if (!defined(_STDDEF_H) && !defined(_STDDEF_H_) && !defined(_ANSI_STDDEF_H) \
     && !defined(__STDDEF_H__)) \
    || defined(__need_wchar_t) || defined(__need_size_t) \
    || defined(__need_ptrdiff_t) || defined(__need_NULL) \
    || defined(__need_wint_t)

/* Any one of these symbols __need_* means that GNU libc
   wants us just to define one data type.  So don't define
   the symbols that indicate this file's entire job has been done.  */
#if (!defined(__need_wchar_t) && !defined(__need_size_t)	\
     && !defined(__need_ptrdiff_t) && !defined(__need_NULL)	\
     && !defined(__need_wint_t))
#define _STDDEF_H
#define _STDDEF_H_
/* snaroff@next.com says the NeXT needs this.  */
#define _ANSI_STDDEF_H
#endif

#ifndef __sys_stdtypes_h
/* This avoids lossage on SunOS but only if stdtypes.h comes first.
   There's no way to win with the other order!  Sun lossage.  */

/* On 4.3bsd-net2, make sure ansi.h is included, so we have
   one less case to deal with in the following.  */
#if defined (__BSD_NET2__) || defined (____386BSD____) || (defined (__FreeBSD__) && (__FreeBSD__ < 5)) || defined(__NetBSD__)
#include <machine/ansi.h>
#endif
/* On FreeBSD 5, machine/ansi.h does not exist anymore... */
#if defined (__FreeBSD__) && (__FreeBSD__ >= 5)
#include <sys/_types.h>
#endif

/* In 4.3bsd-net2, machine/ansi.h defines these symbols, which are
   defined if the corresponding type is *not* defined.
   FreeBSD-2.1 defines _MACHINE_ANSI_H_ instead of _ANSI_H_.
   NetBSD defines _I386_ANSI_H_ and _X86_64_ANSI_H_ instead of _ANSI_H_ */
#if defined(_ANSI_H_) || defined(_MACHINE_ANSI_H_) || defined(_X86_64_ANSI_H_)  || defined(_I386_ANSI_H_)
#if !defined(_SIZE_T_) && !defined(_BSD_SIZE_T_)
#define _SIZE_T
#endif
#if !defined(_PTRDIFF_T_) && !defined(_BSD_PTRDIFF_T_)
#define _PTRDIFF_T
#endif
/* On BSD/386 1.1, at least, machine/ansi.h defines _BSD_WCHAR_T_
   instead of _WCHAR_T_. */
#if !defined(_WCHAR_T_) && !defined(_BSD_WCHAR_T_)
#ifndef _BSD_WCHAR_T_
#define _WCHAR_T
#endif
#endif
/* Undef _FOO_T_ if we are supposed to define foo_t.  */
#if defined (__need_ptrdiff_t) || defined (_STDDEF_H_)
#undef _PTRDIFF_T_
#undef _BSD_PTRDIFF_T_
#endif
#if defined (__need_size_t) || defined (_STDDEF_H_)
#undef _SIZE_T_
#undef _BSD_SIZE_T_
#endif
#if defined (__need_wchar_t) || defined (_STDDEF_H_)
#undef _WCHAR_T_
#undef _BSD_WCHAR_T_
#endif
#endif /* defined(_ANSI_H_) || defined(_MACHINE_ANSI_H_) || defined(_X86_64_ANSI_H_) || defined(_I386_ANSI_H_) */

/* Sequent's header files use _PTRDIFF_T_ in some conflicting way.
   Just ignore it.  */
#if defined (__sequent__) && defined (_PTRDIFF_T_)
#undef _PTRDIFF_T_
#endif

/* On VxWorks, <type/vxTypesBase.h> may have defined macros like
   _TYPE_size_t which will typedef size_t.  fixincludes patched the
   vxTypesBase.h so that this macro is only defined if _GCC_SIZE_T is
   not defined, and so that defining this macro defines _GCC_SIZE_T.
   If we find that the macros are still defined at this point, we must
   invoke them so that the type is defined as expected.  */
#if defined (_TYPE_ptrdiff_t) && (defined (__need_ptrdiff_t) || defined (_STDDEF_H_))
_TYPE_ptrdiff_t;
#undef _TYPE_ptrdiff_t
#endif
#if defined (_TYPE_size_t) && (defined (__need_size_t) || defined (_STDDEF_H_))
_TYPE_size_t;
#undef _TYPE_size_t
#endif
#if defined (_TYPE_wchar_t) && (defined (__need_wchar_t) || defined (_STDDEF_H_))
_TYPE_wchar_t;
#undef _TYPE_wchar_t
#endif

/* In case nobody has defined these types, but we aren't running under
   GCC 2.00, make sure that __PTRDIFF_TYPE__, __SIZE_TYPE__, and
   __WCHAR_TYPE__ have reasonable values.  This can happen if the
   parts of GCC is compiled by an older compiler, that actually
   include gstddef.h, such as collect2.  */

/* Signed type of difference of two pointers.  */

/* Define this type if we are doing the whole job,
   or if we want this type in particular.  */
#if defined (_STDDEF_H) || defined (__need_ptrdiff_t)
#ifndef _PTRDIFF_T	/* in case <sys/types.h> has defined it. */
#ifndef _T_PTRDIFF_
#ifndef _T_PTRDIFF
#ifndef __PTRDIFF_T
#ifndef _PTRDIFF_T_
#ifndef _BSD_PTRDIFF_T_
#ifndef ___int_ptrdiff_t_h
#ifndef _GCC_PTRDIFF_T
#ifndef _PTRDIFF_T_DECLARED /* DragonFly */
#define _PTRDIFF_T
#define _T_PTRDIFF_
#define _T_PTRDIFF
#define __PTRDIFF_T
#define _PTRDIFF_T_
#define _BSD_PTRDIFF_T_
#define ___int_ptrdiff_t_h
#define _GCC_PTRDIFF_T
#define _PTRDIFF_T_DECLARED
#ifndef __PTRDIFF_TYPE__
#define __PTRDIFF_TYPE__ long int
#endif
typedef __PTRDIFF_TYPE__ ptrdiff_t;
#endif /* _PTRDIFF_T_DECLARED */
#endif /* _GCC_PTRDIFF_T */
#endif /* ___int_ptrdiff_t_h */
#endif /* _BSD_PTRDIFF_T_ */
#endif /* _PTRDIFF_T_ */
#endif /* __PTRDIFF_T */
#endif /* _T_PTRDIFF */
#endif /* _T_PTRDIFF_ */
#endif /* _PTRDIFF_T */

/* If this symbol has done its job, get rid of it.  */
#undef	__need_ptrdiff_t

#endif /* _STDDEF_H or __need_ptrdiff_t.  */

/* Unsigned type of `sizeof' something.  */

/* Define this type if we are doing the whole job,
   or if we want this type in particular.  */
#if defined (_STDDEF_H) || defined (__need_size_t)
#ifndef __size_t__	/* BeOS */
#ifndef __SIZE_T__	/* Cray Unicos/Mk */
#ifndef _SIZE_T	/* in case <sys/types.h> has defined it. */
#ifndef _SYS_SIZE_T_H
#ifndef _T_SIZE_
#ifndef _T_SIZE
#ifndef __SIZE_T
#ifndef _SIZE_T_
#ifndef _BSD_SIZE_T_
#ifndef _SIZE_T_DEFINED_
#ifndef _SIZE_T_DEFINED
#ifndef _BSD_SIZE_T_DEFINED_	/* Darwin */
#ifndef _SIZE_T_DECLARED	/* FreeBSD 5 */
#ifndef ___int_size_t_h
#ifndef _GCC_SIZE_T
#ifndef _SIZET_
#ifndef __size_t
#define __size_t__	/* BeOS */
#define __SIZE_T__	/* Cray Unicos/Mk */
#define _SIZE_T
#define _SYS_SIZE_T_H
#define _T_SIZE_
#define _T_SIZE
#define __SIZE_T
#define _SIZE_T_
#define _BSD_SIZE_T_
#define _SIZE_T_DEFINED_
#define _SIZE_T_DEFINED
#define _BSD_SIZE_T_DEFINED_	/* Darwin */
#define _SIZE_T_DECLARED	/* FreeBSD 5 */
#define ___int_size_t_h
#define _GCC_SIZE_T
#define _SIZET_
#if (defined (__FreeBSD__) && (__FreeBSD__ >= 5)) \
  || defined(__DragonFly__) \
  || defined(__FreeBSD_kernel__)
/* __size_t is a typedef on FreeBSD 5, must not trash it. */
#elif defined (__VMS__)
/* __size_t is also a typedef on VMS.  */
#else
#define __size_t
#endif
#ifndef __SIZE_TYPE__
#define __SIZE_TYPE__ long unsigned int
#endif
#if !(defined (__GNUG__) && defined (size_t))
typedef __SIZE_TYPE__ size_t;
#ifdef __BEOS__
typedef long ssize_t;
#endif /* __BEOS__ */
#endif /* !(defined (__GNUG__) && defined (size_t)) */
#endif /* __size_t */
#endif /* _SIZET_ */
#endif /* _GCC_SIZE_T */
#endif /* ___int_size_t_h */
#endif /* _SIZE_T_DECLARED */
#endif /* _BSD_SIZE_T_DEFINED_ */
#endif /* _SIZE_T_DEFINED */
#endif /* _SIZE_T_DEFINED_ */
#endif /* _BSD_SIZE_T_ */
#endif /* _SIZE_T_ */
#endif /* __SIZE_T */
#endif /* _T_SIZE */
#endif /* _T_SIZE_ */
#endif /* _SYS_SIZE_T_H */
#endif /* _SIZE_T */
#endif /* __SIZE_T__ */
#endif /* __size_t__ */
#undef	__need_size_t
#endif /* _STDDEF_H or __need_size_t.  */


/* Wide character type.
   Locale-writers should change this as necessary to
   be big enough to hold unique values not between 0 and 127,
   and not (wchar_t) -1, for each defined multibyte character.  */

/* Define this type if we are doing the whole job,
   or if we want this type in particular.  */
#if defined (_STDDEF_H) || defined (__need_wchar_t)
#ifndef __wchar_t__	/* BeOS */
#ifndef __WCHAR_T__	/* Cray Unicos/Mk */
#ifndef _WCHAR_T
#ifndef _T_WCHAR_
#ifndef _T_WCHAR
#ifndef __WCHAR_T
#ifndef _WCHAR_T_
#ifndef _BSD_WCHAR_T_
#ifndef _BSD_WCHAR_T_DEFINED_    /* Darwin */
#ifndef _BSD_RUNE_T_DEFINED_	/* Darwin */
#ifndef _WCHAR_T_DECLARED /* FreeBSD 5 */
#ifndef _WCHAR_T_DEFINED_
#ifndef _WCHAR_T_DEFINED
#ifndef _WCHAR_T_H
#ifndef ___int_wchar_t_h
#ifndef __INT_WCHAR_T_H
#ifndef _GCC_WCHAR_T
#define __wchar_t__	/* BeOS */
#define __WCHAR_T__	/* Cray Unicos/Mk */
#define _WCHAR_T
#define _T_WCHAR_
#define _T_WCHAR
#define __WCHAR_T
#define _WCHAR_T_
#define _BSD_WCHAR_T_
#define _WCHAR_T_DEFINED_
#define _WCHAR_T_DEFINED
#define _WCHAR_T_H
#define ___int_wchar_t_h
#define __INT_WCHAR_T_H
#define _GCC_WCHAR_T
#define _WCHAR_T_DECLARED

/* On BSD/386 1.1, at least, machine/ansi.h defines _BSD_WCHAR_T_
   instead of _WCHAR_T_, and _BSD_RUNE_T_ (which, unlike the other
   symbols in the _FOO_T_ family, stays defined even after its
   corresponding type is defined).  If we define wchar_t, then we
   must undef _WCHAR_T_; for BSD/386 1.1 (and perhaps others), if
   we undef _WCHAR_T_, then we must also define rune_t, since 
   headers like runetype.h assume that if machine/ansi.h is included,
   and _BSD_WCHAR_T_ is not defined, then rune_t is available.
   machine/ansi.h says, "Note that _WCHAR_T_ and _RUNE_T_ must be of
   the same type." */
#ifdef _BSD_WCHAR_T_
#undef _BSD_WCHAR_T_
#ifdef _BSD_RUNE_T_
#if !defined (_ANSI_SOURCE) && !defined (_POSIX_SOURCE)
typedef _BSD_RUNE_T_ rune_t;
#define _BSD_WCHAR_T_DEFINED_
#define _BSD_RUNE_T_DEFINED_	/* Darwin */
#if defined (__FreeBSD__) && (__FreeBSD__ < 5)
/* Why is this file so hard to maintain properly?  In contrast to
   the comment above regarding BSD/386 1.1, on FreeBSD for as long
   as the symbol has existed, _BSD_RUNE_T_ must not stay defined or
   redundant typedefs will occur when stdlib.h is included after this file. */
#undef _BSD_RUNE_T_
#endif
#endif
#endif
#endif
/* FreeBSD 5 can't be handled well using "traditional" logic above
   since it no longer defines _BSD_RUNE_T_ yet still desires to export
   rune_t in some cases... */
#if defined (__FreeBSD__) && (__FreeBSD__ >= 5)
#if !defined (_ANSI_SOURCE) && !defined (_POSIX_SOURCE)
#if __BSD_VISIBLE
#ifndef _RUNE_T_DECLARED
typedef __rune_t        rune_t;
#define _RUNE_T_DECLARED
#endif
#endif
#endif
#endif

#ifndef __WCHAR_TYPE__
#define __WCHAR_TYPE__ int
#endif
#ifndef __cplusplus
typedef __WCHAR_TYPE__ wchar_t;
#endif
#endif
#endif
#endif
#endif
#endif
#endif
#endif /* _WCHAR_T_DECLARED */
#endif /* _BSD_RUNE_T_DEFINED_ */
#endif
#endif
#endif
#endif
#endif
#endif
#endif
#endif /* __WCHAR_T__ */
#endif /* __wchar_t__ */
#undef	__need_wchar_t
#endif /* _STDDEF_H or __need_wchar_t.  */

#if defined (__need_wint_t)
#ifndef _WINT_T
#define _WINT_T

#ifndef __WINT_TYPE__
#define __WINT_TYPE__ unsigned int
#endif
typedef __WINT_TYPE__ wint_t;
#endif
#undef __need_wint_t
#endif

/*  In 4.3bsd-net2, leave these undefined to indicate that size_t, etc.
    are already defined.  */
/*  BSD/OS 3.1 and FreeBSD [23].x require the MACHINE_ANSI_H check here.  */
/*  NetBSD 5 requires the I386_ANSI_H and X86_64_ANSI_H checks here.  */
#if defined(_ANSI_H_) || defined(_MACHINE_ANSI_H_) || defined(_X86_64_ANSI_H_) || defined(_I386_ANSI_H_)
/*  The references to _GCC_PTRDIFF_T_, _GCC_SIZE_T_, and _GCC_WCHAR_T_
    are probably typos and should be removed before 2.8 is released.  */
#ifdef _GCC_PTRDIFF_T_
#undef _PTRDIFF_T_
#undef _BSD_PTRDIFF_T_
#endif
#ifdef _GCC_SIZE_T_
#undef _SIZE_T_
#undef _BSD_SIZE_T_
#endif
#ifdef _GCC_WCHAR_T_
#undef _WCHAR_T_
#undef _BSD_WCHAR_T_
#endif
/*  The following ones are the real ones.  */
#ifdef _GCC_PTRDIFF_T
#undef _PTRDIFF_T_
#undef _BSD_PTRDIFF_T_
#endif
#ifdef _GCC_SIZE_T
#undef _SIZE_T_
#undef _BSD_SIZE_T_
#endif
#ifdef _GCC_WCHAR_T
#undef _WCHAR_T_
#undef _BSD_WCHAR_T_
#endif
#endif /* _ANSI_H_ || _MACHINE_ANSI_H_ || _X86_64_ANSI_H_ || _I386_ANSI_H_ */

#endif /* __sys_stdtypes_h */

/* A null pointer constant.  */

#if defined (_STDDEF_H) || defined (__need_NULL)
#undef NULL		/* in case <stdio.h> has defined it. */
#ifdef __GNUG__
#define NULL __null
#else   /* G++ */
#ifndef __cplusplus
#define NULL ((void *)0)
#else   /* C++ */
#define NULL 0
#endif  /* C++ */
#endif  /* G++ */
#endif	/* NULL not defined and <stddef.h> or need NULL.  */
#undef	__need_NULL

#ifdef _STDDEF_H

/* Offset of member MEMBER in a struct of type TYPE. */
#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)

#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \
  || (defined(__cplusplus) && __cplusplus >= 201103L)
#ifndef _GCC_MAX_ALIGN_T
#define _GCC_MAX_ALIGN_T
/* Type whose alignment is supported in every context and is at least
   as great as that of any standard type not using alignment
   specifiers.  */
typedef struct {
  long long __max_align_ll __attribute__((__aligned__(__alignof__(long long))));
  long double __max_align_ld __attribute__((__aligned__(__alignof__(long double))));
  /* _Float128 is defined as a basic type, so max_align_t must be
     sufficiently aligned for it.  This code must work in C++, so we
     use __float128 here; that is only available on some
     architectures, but only on i386 is extra alignment needed for
     __float128.  */
#ifdef __i386__
  __float128 __max_align_f128 __attribute__((__aligned__(__alignof(__float128))));
#endif
} max_align_t;
#endif
#endif /* C11 or C++11.  */

#if defined(__cplusplus) && __cplusplus >= 201103L
#ifndef _GXX_NULLPTR_T
#define _GXX_NULLPTR_T
  typedef decltype(nullptr) nullptr_t;
#endif
#endif /* C++11.  */

#endif /* _STDDEF_H was defined this time */

#endif /* !_STDDEF_H && !_STDDEF_H_ && !_ANSI_STDDEF_H && !__STDDEF_H__
	  || __need_XXX was not defined before */
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _XTESTINTRIN_H_INCLUDED
#define _XTESTINTRIN_H_INCLUDED

#ifndef __RTM__
#pragma GCC push_options
#pragma GCC target("rtm")
#define __DISABLE_RTM__
#endif /* __RTM__ */

/* Return non-zero if the instruction executes inside an RTM or HLE code
   region.  Return zero otherwise.   */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xtest (void)
{
  return __builtin_ia32_xtest ();
}

#ifdef __DISABLE_RTM__
#undef __DISABLE_RTM__
#pragma GCC pop_options
#endif /* __DISABLE_RTM__ */

#endif /* _XTESTINTRIN_H_INCLUDED */
#ifndef _VARARGS_H
#define _VARARGS_H

#error "GCC no longer implements <varargs.h>."
#error "Revise your code to use <stdarg.h>."

#endif
/* Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef __VAESINTRIN_H_INCLUDED
#define __VAESINTRIN_H_INCLUDED

#if !defined(__VAES__) || !defined(__AVX__)
#pragma GCC push_options
#pragma GCC target("vaes,avx")
#define __DISABLE_VAES__
#endif /* __VAES__ */

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_aesdec_epi128 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vaesdec_v32qi ((__v32qi) __A, (__v32qi) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_aesdeclast_epi128 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vaesdeclast_v32qi ((__v32qi) __A,
								(__v32qi) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_aesenc_epi128 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vaesenc_v32qi ((__v32qi) __A, (__v32qi) __B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_aesenclast_epi128 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vaesenclast_v32qi ((__v32qi) __A,
								(__v32qi) __B);
}

#ifdef __DISABLE_VAES__
#undef __DISABLE_VAES__
#pragma GCC pop_options
#endif /* __DISABLE_VAES__ */


#if !defined(__VAES__) || !defined(__AVX512F__)
#pragma GCC push_options
#pragma GCC target("vaes,avx512f")
#define __DISABLE_VAESF__
#endif /* __VAES__ */


extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_aesdec_epi128 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vaesdec_v64qi ((__v64qi) __A, (__v64qi) __B);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_aesdeclast_epi128 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vaesdeclast_v64qi ((__v64qi) __A,
						    (__v64qi) __B);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_aesenc_epi128 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vaesenc_v64qi ((__v64qi) __A, (__v64qi) __B);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_aesenclast_epi128 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vaesenclast_v64qi ((__v64qi) __A,
						    (__v64qi) __B);
}

#ifdef __DISABLE_VAESF__
#undef __DISABLE_VAESF__
#pragma GCC pop_options
#endif /* __DISABLE_VAES__ */

#if !defined(__VAES__) || !defined(__AVX512VL__)
#pragma GCC push_options
#pragma GCC target("vaes,avx512vl")
#define __DISABLE_VAESVL__
#endif /* __VAES__ */

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_aesdec_epi128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vaesdec_v16qi ((__v16qi) __A, (__v16qi) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_aesdeclast_epi128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vaesdeclast_v16qi ((__v16qi) __A,
						    (__v16qi) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_aesenc_epi128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vaesenc_v16qi ((__v16qi) __A, (__v16qi) __B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_aesenclast_epi128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vaesenclast_v16qi ((__v16qi) __A,
						    (__v16qi) __B);
}

#ifdef __DISABLE_VAESVL__
#undef __DISABLE_VAESVL__
#pragma GCC pop_options
#endif /* __DISABLE_VAES__ */
#endif /* __VAESINTRIN_H_INCLUDED */
/* Copyright (C) 2004-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the mm3dnow.h (of supposedly AMD origin) included with
   MSVC 7.1.  */

#ifndef _MM3DNOW_H_INCLUDED
#define _MM3DNOW_H_INCLUDED

#include <mmintrin.h>
#include <prfchwintrin.h>

#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW__
#pragma GCC push_options
#ifdef __x86_64__
#pragma GCC target("sse,3dnow")
#else
#pragma GCC target("3dnow")
#endif
#define __DISABLE_3dNOW__
#endif /* __3dNOW__ */

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_femms (void)
{
  __builtin_ia32_femms();
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgusb (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pf2id (__m64 __A)
{
  return (__m64)__builtin_ia32_pf2id ((__v2sf)__A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfacc (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfadd (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfcmpeq (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfcmpge (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfcmpgt (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfmax (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfmin (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfmul (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrcp (__m64 __A)
{
  return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrcpit1 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrcpit2 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrsqrt (__m64 __A)
{
  return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrsqit1 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfsub (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfsubr (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pi2fd (__m64 __A)
{
  return (__m64)__builtin_ia32_pi2fd ((__v2si)__A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhrw (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_prefetch (void *__P)
{
  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_float (float __A)
{
  return __extension__ (__m64)(__v2sf){ __A, 0.0f };
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_float (__m64 __A)
{
  union { __v2sf v; float a[2]; } __tmp;
  __tmp.v = (__v2sf)__A;
  return __tmp.a[0];
}

#ifdef __DISABLE_3dNOW__
#undef __DISABLE_3dNOW__
#pragma GCC pop_options
#endif /* __DISABLE_3dNOW__ */

#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW_A__
#pragma GCC push_options
#ifdef __x86_64__
#pragma GCC target("sse,3dnowa")
#else
#pragma GCC target("3dnowa")
#endif
#define __DISABLE_3dNOW_A__
#endif /* __3dNOW_A__ */

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pf2iw (__m64 __A)
{
  return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfnacc (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfpnacc (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pi2fw (__m64 __A)
{
  return (__m64)__builtin_ia32_pi2fw ((__v2si)__A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pswapd (__m64 __A)
{
  return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A);
}

#ifdef __DISABLE_3dNOW_A__
#undef __DISABLE_3dNOW_A__
#pragma GCC pop_options
#endif /* __DISABLE_3dNOW_A__ */

#endif /* _MM3DNOW_H_INCLUDED */
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* #if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED */
/* # error "Never use <xsaveoptintrin.h> directly; include <x86intrin.h> instead." */
/* #endif */

#ifndef _XSAVEOPTINTRIN_H_INCLUDED
#define _XSAVEOPTINTRIN_H_INCLUDED

#ifndef __XSAVEOPT__
#pragma GCC push_options
#pragma GCC target("xsaveopt")
#define __DISABLE_XSAVEOPT__
#endif /* __XSAVEOPT__ */

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xsaveopt (void *__P, long long __M)
{
  __builtin_ia32_xsaveopt (__P, __M);
}

#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xsaveopt64 (void *__P, long long __M)
{
  __builtin_ia32_xsaveopt64 (__P, __M);
}
#endif

#ifdef __DISABLE_XSAVEOPT__
#undef __DISABLE_XSAVEOPT__
#pragma GCC pop_options
#endif /* __DISABLE_XSAVEOPT__ */

#endif /* _XSAVEOPTINTRIN_H_INCLUDED */
/* Copyright (C) 2011-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
# error "Never use <f16intrin.h> directly; include <x86intrin.h> or <immintrin.h> instead."
#endif

#ifndef _F16CINTRIN_H_INCLUDED
#define _F16CINTRIN_H_INCLUDED

#ifndef __F16C__
#pragma GCC push_options
#pragma GCC target("f16c")
#define __DISABLE_F16C__
#endif /* __F16C__ */

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtsh_ss (unsigned short __S)
{
  __v8hi __H = __extension__ (__v8hi){ (short) __S, 0, 0, 0, 0, 0, 0, 0 };
  __v4sf __A = __builtin_ia32_vcvtph2ps (__H);
  return __builtin_ia32_vec_ext_v4sf (__A, 0);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtph_ps (__m128i __A)
{
  return (__m128) __builtin_ia32_vcvtph2ps ((__v8hi) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtph_ps (__m128i __A)
{
  return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A);
}

#ifdef __OPTIMIZE__
extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtss_sh (float __F, const int __I)
{
  __v4sf __A =  __extension__ (__v4sf){ __F, 0, 0, 0 };
  __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);
  return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_ph (__m128 __A, const int __I)
{
  return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_ph (__m256 __A, const int __I)
{
  return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I);
}
#else
#define _cvtss_sh(__F, __I)						\
  (__extension__ 							\
   ({									\
      __v4sf __A =  __extension__ (__v4sf){ __F, 0, 0, 0 };		\
      __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);			\
      (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);		\
    }))

#define _mm_cvtps_ph(A, I) \
  ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) (A), (int) (I)))

#define _mm256_cvtps_ph(A, I) \
  ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) (A), (int) (I)))
#endif /* __OPTIMIZE */

#ifdef __DISABLE_F16C__
#undef __DISABLE_F16C__
#pragma GCC pop_options
#endif /* __DISABLE_F16C__ */

#endif /* _F16CINTRIN_H_INCLUDED */
/* Copyright (C) 2006-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.1.  */

#ifndef _TMMINTRIN_H_INCLUDED
#define _TMMINTRIN_H_INCLUDED

/* We need definitions from the SSE3, SSE2 and SSE header files*/
#include <pmmintrin.h>

#ifndef __SSSE3__
#pragma GCC push_options
#pragma GCC target("ssse3")
#define __DISABLE_SSSE3__
#endif /* __SSSE3__ */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __X, __m64 __Y)
{
  return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
{
  return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X,
					      (__v2di)__Y, __N * 8);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N)
{
  return (__m64) __builtin_ia32_palignr ((__v1di)__X,
					 (__v1di)__Y, __N * 8);
}
#else
#define _mm_alignr_epi8(X, Y, N)					\
  ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X),		\
					(__v2di)(__m128i)(Y),		\
					(int)(N) * 8))
#define _mm_alignr_pi8(X, Y, N)						\
  ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X),			\
				   (__v1di)(__m64)(Y),			\
				   (int)(N) * 8))
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __X)
{
  return (__m64) __builtin_ia32_pabsb ((__v8qi)__X);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __X)
{
  return (__m64) __builtin_ia32_pabsw ((__v4hi)__X);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __X)
{
  return (__m64) __builtin_ia32_pabsd ((__v2si)__X);
}

#ifdef __DISABLE_SSSE3__
#undef __DISABLE_SSSE3__
#pragma GCC pop_options
#endif /* __DISABLE_SSSE3__ */

#endif /* _TMMINTRIN_H_INCLUDED */
/* Copyright (C) 2018-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _X86INTRIN_H_INCLUDED
#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _WBNOINVDINTRIN_H_INCLUDED
#define _WBNOINVDINTRIN_H_INCLUDED

#ifndef __WBNOINVD__
#pragma GCC push_options
#pragma GCC target("wbnoinvd")
#define __DISABLE_WBNOINVD__
#endif /* __WBNOINVD__ */

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wbnoinvd (void)
{
  __builtin_ia32_wbnoinvd ();
}

#ifdef __DISABLE_WBNOINVD__
#undef __DISABLE_WBNOINVD__
#pragma GCC pop_options
#endif /* __DISABLE_WBNOINVD__ */

#endif /* _WBNOINVDINTRIN_H_INCLUDED */
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _RDSEEDINTRIN_H_INCLUDED
#define _RDSEEDINTRIN_H_INCLUDED

#ifndef __RDSEED__
#pragma GCC push_options
#pragma GCC target("rdseed")
#define __DISABLE_RDSEED__
#endif /* __RDSEED__ */


extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdseed16_step (unsigned short *__p)
{
  return __builtin_ia32_rdseed_hi_step (__p);
}

extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdseed32_step (unsigned int *__p)
{
  return __builtin_ia32_rdseed_si_step (__p);
}

#ifdef __x86_64__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdseed64_step (unsigned long long *__p)
{
  return __builtin_ia32_rdseed_di_step (__p);
}
#endif

#ifdef __DISABLE_RDSEED__
#undef __DISABLE_RDSEED__
#pragma GCC pop_options
#endif /* __DISABLE_RDSEED__ */

#endif /* _RDSEEDINTRIN_H_INCLUDED */
/* Copyright (C) 2014-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VLDQINTRIN_H_INCLUDED
#define _AVX512VLDQINTRIN_H_INCLUDED

#if !defined(__AVX512VL__) || !defined(__AVX512DQ__)
#pragma GCC push_options
#pragma GCC target("avx512vl,avx512dq")
#define __DISABLE_AVX512VLDQ__
#endif /* __AVX512VLDQ__ */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi64 (__m256d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
						     (__v4di) __W,
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi64 (__m128d __A)
{
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
						     (__v2di) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epu64 (__m256d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
						      (__v4di)
						      _mm256_setzero_si256 (),
						      (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
						      (__v4di) __W,
						      (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
						      (__v4di)
						      _mm256_setzero_si256 (),
						      (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epu64 (__m128d __A)
{
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
						      (__v2di)
						      _mm_setzero_si128 (),
						      (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
						      (__v2di) __W,
						      (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
						      (__v2di)
						      _mm_setzero_si128 (),
						      (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi64 (__m256d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
						    (__v4di)
						    _mm256_setzero_si256 (),
						    (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
						    (__v4di) __W,
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
						    (__v4di)
						    _mm256_setzero_si256 (),
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi64 (__m128d __A)
{
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
						    (__v2di)
						    _mm_setzero_si128 (),
						    (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
						    (__v2di) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
						    (__v2di)
						    _mm_setzero_si128 (),
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epu64 (__m256d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
						     (__v4di) __W,
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epu64 (__m128d __A)
{
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
						     (__v2di) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A)
{
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi64 (__m128 __A)
{
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
						     (__v4di) __W,
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi64 (__m128 __A)
{
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
						     (__v2di) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epu64 (__m128 __A)
{
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
						      (__v4di)
						      _mm256_setzero_si256 (),
						      (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
						      (__v4di) __W,
						      (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
						      (__v4di)
						      _mm256_setzero_si256 (),
						      (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epu64 (__m128 __A)
{
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
						      (__v2di)
						      _mm_setzero_si128 (),
						      (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
						      (__v2di) __W,
						      (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
						      (__v2di)
						      _mm_setzero_si128 (),
						      (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_f64x2 (__m128d __A)
{
  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
							   __A,
						           (__v4df)_mm256_undefined_pd(),
							   (__mmask8) -1);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
{
  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
							   __A,
							   (__v4df)
							   __O, __M);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
{
  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
							   __A,
							   (__v4df)
							   _mm256_setzero_ps (),
							   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_i64x2 (__m128i __A)
{
  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
							   __A,
						           (__v4di)_mm256_undefined_si256(),
							   (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
							   __A,
							   (__v4di)
							   __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
							   __A,
							   (__v4di)
							   _mm256_setzero_si256 (),
							   __M);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_f32x2 (__m128 __A)
{
  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
						          (__v8sf)_mm256_undefined_ps(),
							  (__mmask8) -1);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
{
  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
							  (__v8sf) __O,
							  __M);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
{
  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
							  (__v8sf)
							  _mm256_setzero_ps (),
							  __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_i32x2 (__m128i __A)
{
  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
							   __A,
						          (__v8si)_mm256_undefined_si256(),
							   (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
							   __A,
							   (__v8si)
							   __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
							   __A,
							   (__v8si)
							   _mm256_setzero_si256 (),
							   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_i32x2 (__m128i __A)
{
  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
							   __A,
						          (__v4si)_mm_undefined_si128(),
							   (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
							   __A,
							   (__v4si)
							   __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
							   __A,
							   (__v4si)
							   _mm_setzero_si128 (),
							   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i) ((__v4du) __A * (__v4du) __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
			 __m256i __B)
{
  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
						  (__v4di) __B,
						  (__v4di) __W,
						  (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
						  (__v4di) __B,
						  (__v4di)
						  _mm256_setzero_si256 (),
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du) __A * (__v2du) __B);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
		      __m128i __B)
{
  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
						  (__v2di) __B,
						  (__v2di) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
						  (__v2di) __B,
						  (__v2di)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A,
		       __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
						  (__v4df) __B,
						  (__v4df) __W,
						  (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
						  (__v4df) __B,
						  (__v4df)
						  _mm256_setzero_pd (),
						  (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A,
		    __m128d __B)
{
  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
						  (__v2df) __B,
						  (__v2df) __W,
						  (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
						  (__v2df) __B,
						  (__v2df)
						  _mm_setzero_pd (),
						  (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A,
		       __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
						 (__v8sf) __B,
						 (__v8sf) __W,
						 (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
						 (__v8sf) __B,
						 (__v8sf)
						 _mm256_setzero_ps (),
						 (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __W,
						 (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf)
						 _mm_setzero_ps (),
						 (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi64 (__m128 __A)
{
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
						    (__v4di)
						    _mm256_setzero_si256 (),
						    (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
						    (__v4di) __W,
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
						    (__v4di)
						    _mm256_setzero_si256 (),
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi64 (__m128 __A)
{
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
						    (__v2di)
						    _mm_setzero_si128 (),
						    (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
						    (__v2di) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
						    (__v2di)
						    _mm_setzero_si128 (),
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epu64 (__m128 __A)
{
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
						     (__v4di) __W,
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
{
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
						     (__v4di)
						     _mm256_setzero_si256 (),
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epu64 (__m128 __A)
{
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
						     (__v2di) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
{
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
						     (__v2di)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi64_ps (__m256i __A)
{
  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
						   (__v4sf)
						   _mm_setzero_ps (),
						   (__mmask8) -1);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A)
{
  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
						   (__v4sf) __W,
						   (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A)
{
  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
						   (__v4sf)
						   _mm_setzero_ps (),
						   (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi64_ps (__m128i __A)
{
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
						   (__v4sf)
						   _mm_setzero_ps (),
						   (__mmask8) -1);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A)
{
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
						   (__v4sf) __W,
						   (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A)
{
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
						   (__v4sf)
						   _mm_setzero_ps (),
						   (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu64_ps (__m256i __A)
{
  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
						    (__v4sf)
						    _mm_setzero_ps (),
						    (__mmask8) -1);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A)
{
  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
						    (__v4sf) __W,
						    (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A)
{
  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
						    (__v4sf)
						    _mm_setzero_ps (),
						    (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu64_ps (__m128i __A)
{
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
						    (__v4sf)
						    _mm_setzero_ps (),
						    (__mmask8) -1);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A)
{
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
						    (__v4sf) __W,
						    (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A)
{
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
						    (__v4sf)
						    _mm_setzero_ps (),
						    (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi64_pd (__m256i __A)
{
  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
						    (__v4df)
						    _mm256_setzero_pd (),
						    (__mmask8) -1);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A)
{
  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
						    (__v4df) __W,
						    (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A)
{
  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
						    (__v4df)
						    _mm256_setzero_pd (),
						    (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi64_pd (__m128i __A)
{
  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
						    (__v2df)
						    _mm_setzero_pd (),
						    (__mmask8) -1);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A)
{
  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
						    (__v2df) __W,
						    (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A)
{
  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
						    (__v2df)
						    _mm_setzero_pd (),
						    (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu64_pd (__m256i __A)
{
  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
						     (__v4df)
						     _mm256_setzero_pd (),
						     (__mmask8) -1);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A)
{
  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
						     (__v4df) __W,
						     (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A)
{
  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
						     (__v4df)
						     _mm256_setzero_pd (),
						     (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A,
		    __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
						 (__v4df) __B,
						 (__v4df) __W,
						 (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
						 (__v4df) __B,
						 (__v4df)
						 _mm256_setzero_pd (),
						 (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
						(__v8sf) __B,
						(__v8sf) __W,
						(__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
						(__v8sf) __B,
						(__v8sf)
						_mm256_setzero_ps (),
						(__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf) __W,
						(__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf)
						_mm_setzero_ps (),
						(__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu64_pd (__m128i __A)
{
  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
						     (__v2df)
						     _mm_setzero_pd (),
						     (__mmask8) -1);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A)
{
  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
						     (__v2df) __W,
						     (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A)
{
  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
						     (__v2df)
						     _mm_setzero_pd (),
						     (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
		    __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
						 (__v4df) __B,
						 (__v4df) __W,
						 (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
						 (__v4df) __B,
						 (__v4df)
						 _mm256_setzero_pd (),
						 (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df) __W,
						 (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
						 (__v2df) __B,
						 (__v2df)
						 _mm_setzero_pd (),
						 (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
						(__v8sf) __B,
						(__v8sf) __W,
						(__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
						(__v8sf) __B,
						(__v8sf)
						_mm256_setzero_ps (),
						(__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf) __W,
						(__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
						(__v4sf) __B,
						(__v4sf)
						_mm_setzero_ps (),
						(__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
						(__v4df) __B,
						(__v4df) __W,
						(__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
						(__v4df) __B,
						(__v4df)
						_mm256_setzero_pd (),
						(__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
						(__v2df) __B,
						(__v2df) __W,
						(__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
						(__v2df) __B,
						(__v2df)
						_mm_setzero_pd (),
						(__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
					       (__v8sf) __B,
					       (__v8sf) __W,
					       (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
					       (__v8sf) __B,
					       (__v8sf)
					       _mm256_setzero_ps (),
					       (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
					       (__v4sf) __B,
					       (__v4sf) __W,
					       (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
					       (__v4sf) __B,
					       (__v4sf)
					       _mm_setzero_ps (),
					       (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movm_epi32 (__mmask8 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movm_epi32 (__mmask8 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movm_epi64 (__mmask8 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movm_epi64 (__mmask8 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi32_mask (__m128i __A)
{
  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movepi32_mask (__m256i __A)
{
  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_mask (__m128i __A)
{
  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movepi64_mask (__m256i __A)
{
  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf64x2_pd (__m256d __A, const int __imm)
{
  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
							 __imm,
							 (__v2df)
							 _mm_setzero_pd (),
							 (__mmask8) -1);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m256d __A,
			     const int __imm)
{
  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
							 __imm,
							 (__v2df) __W,
							 (__mmask8)
							 __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_extractf64x2_pd (__mmask8 __U, __m256d __A,
			      const int __imm)
{
  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
							 __imm,
							 (__v2df)
							 _mm_setzero_pd (),
							 (__mmask8)
							 __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti64x2_epi64 (__m256i __A, const int __imm)
{
  return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
							 __imm,
							 (__v2di)
							 _mm_setzero_si128 (),
							 (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m256i __A,
				const int __imm)
{
  return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
							 __imm,
							 (__v2di) __W,
							 (__mmask8)
							 __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_extracti64x2_epi64 (__mmask8 __U, __m256i __A,
				 const int __imm)
{
  return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
							 __imm,
							 (__v2di)
							 _mm_setzero_si128 (),
							 (__mmask8)
							 __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_pd (__m256d __A, int __B)
{
  return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
						    (__v4df)
						    _mm256_setzero_pd (),
						    (__mmask8) -1);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_pd (__m256d __W, __mmask8 __U, __m256d __A, int __B)
{
  return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
						    (__v4df) __W,
						    (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_reduce_pd (__mmask8 __U, __m256d __A, int __B)
{
  return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
						    (__v4df)
						    _mm256_setzero_pd (),
						    (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_pd (__m128d __A, int __B)
{
  return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
						    (__v2df)
						    _mm_setzero_pd (),
						    (__mmask8) -1);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_pd (__m128d __W, __mmask8 __U, __m128d __A, int __B)
{
  return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
						    (__v2df) __W,
						    (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_reduce_pd (__mmask8 __U, __m128d __A, int __B)
{
  return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
						    (__v2df)
						    _mm_setzero_pd (),
						    (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_ps (__m256 __A, int __B)
{
  return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
						   (__v8sf)
						   _mm256_setzero_ps (),
						   (__mmask8) -1);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_ps (__m256 __W, __mmask8 __U, __m256 __A, int __B)
{
  return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
						   (__v8sf) __W,
						   (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_reduce_ps (__mmask8 __U, __m256 __A, int __B)
{
  return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
						   (__v8sf)
						   _mm256_setzero_ps (),
						   (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_ps (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
						   (__v4sf)
						   _mm_setzero_ps (),
						   (__mmask8) -1);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_ps (__m128 __W, __mmask8 __U, __m128 __A, int __B)
{
  return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
						   (__v4sf) __W,
						   (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_reduce_ps (__mmask8 __U, __m128 __A, int __B)
{
  return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
						   (__v4sf)
						   _mm_setzero_ps (),
						   (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_range_pd (__m256d __A, __m256d __B, int __C)
{
  return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
						   (__v4df) __B, __C,
						   (__v4df)
						   _mm256_setzero_pd (),
						   (__mmask8) -1);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_range_pd (__m256d __W, __mmask8 __U,
		      __m256d __A, __m256d __B, int __C)
{
  return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
						   (__v4df) __B, __C,
						   (__v4df) __W,
						   (__mmask8) __U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_range_pd (__mmask8 __U, __m256d __A, __m256d __B, int __C)
{
  return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
						   (__v4df) __B, __C,
						   (__v4df)
						   _mm256_setzero_pd (),
						   (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_range_pd (__m128d __A, __m128d __B, int __C)
{
  return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
						   (__v2df) __B, __C,
						   (__v2df)
						   _mm_setzero_pd (),
						   (__mmask8) -1);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_range_pd (__m128d __W, __mmask8 __U,
		   __m128d __A, __m128d __B, int __C)
{
  return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
						   (__v2df) __B, __C,
						   (__v2df) __W,
						   (__mmask8) __U);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_range_pd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
{
  return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
						   (__v2df) __B, __C,
						   (__v2df)
						   _mm_setzero_pd (),
						   (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_range_ps (__m256 __A, __m256 __B, int __C)
{
  return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
						  (__v8sf) __B, __C,
						  (__v8sf)
						  _mm256_setzero_ps (),
						  (__mmask8) -1);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_range_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B,
		      int __C)
{
  return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
						  (__v8sf) __B, __C,
						  (__v8sf) __W,
						  (__mmask8) __U);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_range_ps (__mmask8 __U, __m256 __A, __m256 __B, int __C)
{
  return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
						  (__v8sf) __B, __C,
						  (__v8sf)
						  _mm256_setzero_ps (),
						  (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_range_ps (__m128 __A, __m128 __B, int __C)
{
  return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
						  (__v4sf) __B, __C,
						  (__v4sf)
						  _mm_setzero_ps (),
						  (__mmask8) -1);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_range_ps (__m128 __W, __mmask8 __U,
		   __m128 __A, __m128 __B, int __C)
{
  return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
						  (__v4sf) __B, __C,
						  (__v4sf) __W,
						  (__mmask8) __U);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_range_ps (__mmask8 __U, __m128 __A, __m128 __B, int __C)
{
  return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
						  (__v4sf) __B, __C,
						  (__v4sf)
						  _mm_setzero_ps (),
						  (__mmask8) __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_fpclass_pd_mask (__mmask8 __U, __m256d __A,
			     const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A,
						      __imm, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fpclass_pd_mask (__m256d __A, const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A,
						      __imm,
						      (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_fpclass_ps_mask (__mmask8 __U, __m256 __A, const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A,
						      __imm, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fpclass_ps_mask (__m256 __A, const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A,
						      __imm,
						      (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_fpclass_pd_mask (__mmask8 __U, __m128d __A, const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A,
						      __imm, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fpclass_pd_mask (__m128d __A, const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A,
						      __imm,
						      (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_fpclass_ps_mask (__mmask8 __U, __m128 __A, const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A,
						      __imm, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_fpclass_ps_mask (__m128 __A, const int __imm)
{
  return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A,
						      __imm,
						      (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm)
{
  return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
							(__v2di) __B,
							__imm,
							(__v4di)
							_mm256_setzero_si256 (),
							(__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_inserti64x2 (__m256i __W, __mmask8 __U, __m256i __A,
			 __m128i __B, const int __imm)
{
  return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
							(__v2di) __B,
							__imm,
							(__v4di) __W,
							(__mmask8)
							__U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_inserti64x2 (__mmask8 __U, __m256i __A, __m128i __B,
			  const int __imm)
{
  return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
							(__v2di) __B,
							__imm,
							(__v4di)
							_mm256_setzero_si256 (),
							(__mmask8)
							__U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm)
{
  return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
							(__v2df) __B,
							__imm,
							(__v4df)
							_mm256_setzero_pd (),
							(__mmask8) -1);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_insertf64x2 (__m256d __W, __mmask8 __U, __m256d __A,
			 __m128d __B, const int __imm)
{
  return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
							(__v2df) __B,
							__imm,
							(__v4df) __W,
							(__mmask8)
							__U);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_insertf64x2 (__mmask8 __U, __m256d __A, __m128d __B,
			  const int __imm)
{
  return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
							(__v2df) __B,
							__imm,
							(__v4df)
							_mm256_setzero_pd (),
							(__mmask8)
							__U);
}

#else
#define _mm256_insertf64x2(X, Y, C)                                     \
  ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
    (__v2df)(__m128d) (Y), (int) (C),					\
    (__v4df)(__m256d)_mm256_setzero_pd(),				\
    (__mmask8)-1))

#define _mm256_mask_insertf64x2(W, U, X, Y, C)                          \
  ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
    (__v2df)(__m128d) (Y), (int) (C),					\
    (__v4df)(__m256d)(W),						\
    (__mmask8)(U)))

#define _mm256_maskz_insertf64x2(U, X, Y, C)				\
  ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
    (__v2df)(__m128d) (Y), (int) (C),					\
    (__v4df)(__m256d)_mm256_setzero_pd(),				\
    (__mmask8)(U)))

#define _mm256_inserti64x2(X, Y, C)                                     \
  ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
    (__v2di)(__m128i) (Y), (int) (C),					\
    (__v4di)(__m256i)_mm256_setzero_si256 (),				\
    (__mmask8)-1))

#define _mm256_mask_inserti64x2(W, U, X, Y, C)                          \
  ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
    (__v2di)(__m128i) (Y), (int) (C),					\
    (__v4di)(__m256i)(W),						\
    (__mmask8)(U)))

#define _mm256_maskz_inserti64x2(U, X, Y, C)                            \
  ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
    (__v2di)(__m128i) (Y), (int) (C),					\
    (__v4di)(__m256i)_mm256_setzero_si256 (),				\
    (__mmask8)(U)))

#define _mm256_extractf64x2_pd(X, C)                                    \
  ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
    (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8)-1))

#define _mm256_mask_extractf64x2_pd(W, U, X, C)                         \
  ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
    (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U)))

#define _mm256_maskz_extractf64x2_pd(U, X, C)                           \
  ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
    (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8) (U)))

#define _mm256_extracti64x2_epi64(X, C)                                 \
  ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
    (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1))

#define _mm256_mask_extracti64x2_epi64(W, U, X, C)                     \
  ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
    (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U)))

#define _mm256_maskz_extracti64x2_epi64(U, X, C)                        \
  ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
    (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))

#define _mm256_reduce_pd(A, B)						\
  ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A),	\
    (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)-1))

#define _mm256_mask_reduce_pd(W, U, A, B)				\
  ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A),	\
    (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm256_maskz_reduce_pd(U, A, B)					\
  ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A),	\
    (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))

#define _mm_reduce_pd(A, B)						\
  ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A),	\
    (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1))

#define _mm_mask_reduce_pd(W, U, A, B)					\
  ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A),	\
    (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_reduce_pd(U, A, B)					\
  ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A),	\
    (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U)))

#define _mm256_reduce_ps(A, B)						\
  ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A),	\
    (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))

#define _mm256_mask_reduce_ps(W, U, A, B)				\
  ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A),	\
    (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U)))

#define _mm256_maskz_reduce_ps(U, A, B)					\
  ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A),	\
    (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))

#define _mm_reduce_ps(A, B)						\
  ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A),	\
    (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1))

#define _mm_mask_reduce_ps(W, U, A, B)					\
  ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A),	\
    (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U)))

#define _mm_maskz_reduce_ps(U, A, B)					\
  ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A),	\
    (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))

#define _mm256_range_pd(A, B, C)					\
  ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A),	\
    (__v4df)(__m256d)(B), (int)(C),					\
    (__v4df)_mm256_setzero_pd(), (__mmask8)-1))

#define _mm256_maskz_range_pd(U, A, B, C)				\
  ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A),	\
    (__v4df)(__m256d)(B), (int)(C),					\
    (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))

#define _mm_range_pd(A, B, C)						\
  ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A),	\
    (__v2df)(__m128d)(B), (int)(C),					\
    (__v2df)_mm_setzero_pd(), (__mmask8)-1))

#define _mm256_range_ps(A, B, C)					\
  ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A),	\
    (__v8sf)(__m256)(B), (int)(C),					\
    (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))

#define _mm256_mask_range_ps(W, U, A, B, C)				\
  ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A),	\
    (__v8sf)(__m256)(B), (int)(C),					\
    (__v8sf)(__m256)(W), (__mmask8)(U)))

#define _mm256_maskz_range_ps(U, A, B, C)				\
  ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A),	\
    (__v8sf)(__m256)(B), (int)(C),					\
    (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))

#define _mm_range_ps(A, B, C)						\
  ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A),	\
    (__v4sf)(__m128)(B), (int)(C),					\
    (__v4sf)_mm_setzero_ps(), (__mmask8)-1))

#define _mm_mask_range_ps(W, U, A, B, C)				\
  ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A),	\
    (__v4sf)(__m128)(B), (int)(C),					\
    (__v4sf)(__m128)(W), (__mmask8)(U)))

#define _mm_maskz_range_ps(U, A, B, C)					\
  ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A),	\
    (__v4sf)(__m128)(B), (int)(C),					\
    (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))

#define _mm256_mask_range_pd(W, U, A, B, C)				\
  ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A),	\
    (__v4df)(__m256d)(B), (int)(C),					\
    (__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm_mask_range_pd(W, U, A, B, C)				\
  ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A),	\
    (__v2df)(__m128d)(B), (int)(C),					\
    (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_range_pd(U, A, B, C)					\
  ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A),	\
    (__v2df)(__m128d)(B), (int)(C),					\
    (__v2df)_mm_setzero_pd(), (__mmask8)(U)))

#define _mm256_mask_fpclass_pd_mask(u, X, C)                            \
  ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \
						(int) (C),(__mmask8)(u)))

#define _mm256_mask_fpclass_ps_mask(u, X, C)				\
  ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X),  \
						(int) (C),(__mmask8)(u)))

#define _mm_mask_fpclass_pd_mask(u, X, C)                               \
  ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \
						(int) (C),(__mmask8)(u)))

#define _mm_mask_fpclass_ps_mask(u, X, C)                               \
  ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X),  \
						(int) (C),(__mmask8)(u)))

#define _mm256_fpclass_pd_mask(X, C)                                    \
  ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \
						(int) (C),(__mmask8)-1))

#define _mm256_fpclass_ps_mask(X, C)                                    \
  ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X),  \
						(int) (C),(__mmask8)-1))

#define _mm_fpclass_pd_mask(X, C)                                       \
  ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \
						(int) (C),(__mmask8)-1))

#define _mm_fpclass_ps_mask(X, C)                                       \
  ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X),  \
						(int) (C),(__mmask8)-1))

#endif

#ifdef __DISABLE_AVX512VLDQ__
#undef __DISABLE_AVX512VLDQ__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VLDQ__ */

#endif /* _AVX512VLDQINTRIN_H_INCLUDED */
/* Copyright (C) 2009-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _X86INTRIN_H_INCLUDED
# error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
#endif

/* 32bit bsf */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsfd (int __X)
{
  return __builtin_ctz (__X);
}

/* 32bit bsr */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsrd (int __X)
{
  return __builtin_ia32_bsrsi (__X);
}

/* 32bit bswap */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bswapd (int __X)
{
  return __builtin_bswap32 (__X);
}

#ifndef __iamcu__

#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */

/* 32bit accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32b (unsigned int __C, unsigned char __V)
{
  return __builtin_ia32_crc32qi (__C, __V);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32w (unsigned int __C, unsigned short __V)
{
  return __builtin_ia32_crc32hi (__C, __V);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32d (unsigned int __C, unsigned int __V)
{
  return __builtin_ia32_crc32si (__C, __V);
}

#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */

#endif /* __iamcu__ */

/* 32bit popcnt */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__popcntd (unsigned int __X)
{
  return __builtin_popcount (__X);
}

#ifndef __iamcu__

/* rdpmc */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdpmc (int __S)
{
  return __builtin_ia32_rdpmc (__S);
}

#endif /* __iamcu__ */

/* rdtsc */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdtsc (void)
{
  return __builtin_ia32_rdtsc ();
}

#ifndef __iamcu__

/* rdtscp */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdtscp (unsigned int *__A)
{
  return __builtin_ia32_rdtscp (__A);
}

#endif /* __iamcu__ */

/* 8bit rol */
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rolb (unsigned char __X, int __C)
{
  return __builtin_ia32_rolqi (__X, __C);
}

/* 16bit rol */
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rolw (unsigned short __X, int __C)
{
  return __builtin_ia32_rolhi (__X, __C);
}

/* 32bit rol */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rold (unsigned int __X, int __C)
{
  __C &= 31;
  return (__X << __C) | (__X >> (-__C & 31));
}

/* 8bit ror */
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rorb (unsigned char __X, int __C)
{
  return __builtin_ia32_rorqi (__X, __C);
}

/* 16bit ror */
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rorw (unsigned short __X, int __C)
{
  return __builtin_ia32_rorhi (__X, __C);
}

/* 32bit ror */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rord (unsigned int __X, int __C)
{
  __C &= 31;
  return (__X >> __C) | (__X << (-__C & 31));
}

/* Pause */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__pause (void)
{
  __builtin_ia32_pause ();
}

#ifdef __x86_64__
/* 64bit bsf */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsfq (long long __X)
{
  return __builtin_ctzll (__X);
}

/* 64bit bsr */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsrq (long long __X)
{
  return __builtin_ia32_bsrdi (__X);
}

/* 64bit bswap */
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bswapq (long long __X)
{
  return __builtin_bswap64 (__X);
}

#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */

/* 64bit accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32q (unsigned long long __C, unsigned long long __V)
{
  return __builtin_ia32_crc32di (__C, __V);
}

#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */

/* 64bit popcnt */
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__popcntq (unsigned long long __X)
{
  return __builtin_popcountll (__X);
}

/* 64bit rol */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rolq (unsigned long long __X, int __C)
{
  __C &= 63;
  return (__X << __C) | (__X >> (-__C & 63));
}

/* 64bit ror */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rorq (unsigned long long __X, int __C)
{
  __C &= 63;
  return (__X >> __C) | (__X << (-__C & 63));
}

/* Read flags register */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__readeflags (void)
{
  return __builtin_ia32_readeflags_u64 ();
}

/* Write flags register */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__writeeflags (unsigned long long __X)
{
  __builtin_ia32_writeeflags_u64 (__X);
}

#define _bswap64(a)		__bswapq(a)
#define _popcnt64(a)		__popcntq(a)
#else

/* Read flags register */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__readeflags (void)
{
  return __builtin_ia32_readeflags_u32 ();
}

/* Write flags register */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__writeeflags (unsigned int __X)
{
  __builtin_ia32_writeeflags_u32 (__X);
}

#endif

/* On LP64 systems, longs are 64-bit.  Use the appropriate rotate
 * function.  */
#ifdef __LP64__
#define _lrotl(a,b)		__rolq((a), (b))
#define _lrotr(a,b)		__rorq((a), (b))
#else
#define _lrotl(a,b)		__rold((a), (b))
#define _lrotr(a,b)		__rord((a), (b))
#endif

#define _bit_scan_forward(a)	__bsfd(a)
#define _bit_scan_reverse(a)	__bsrd(a)
#define _bswap(a)		__bswapd(a)
#define _popcnt32(a)		__popcntd(a)
#ifndef __iamcu__
#define _rdpmc(a)		__rdpmc(a)
#define _rdtscp(a)		__rdtscp(a)
#endif /* __iamcu__ */
#define _rdtsc()		__rdtsc()
#define _rotwl(a,b)		__rolw((a), (b))
#define _rotwr(a,b)		__rorw((a), (b))
#define _rotl(a,b)		__rold((a), (b))
#define _rotr(a,b)		__rord((a), (b))
/* Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _IMMINTRIN_H_INCLUDED
# error "Never use <avx512bitalgintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _AVX512BITALGINTRIN_H_INCLUDED
#define _AVX512BITALGINTRIN_H_INCLUDED

#ifndef __AVX512BITALG__
#pragma GCC push_options
#pragma GCC target("avx512bitalg")
#define __DISABLE_AVX512BITALG__
#endif /* __AVX512BITALG__ */

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi8 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountb_v64qi ((__v64qi) __A);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi16 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountw_v32hi ((__v32hi) __A);
}

#ifdef __DISABLE_AVX512BITALG__
#undef __DISABLE_AVX512BITALG__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALG__ */

#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512bitalg,avx512bw")
#define __DISABLE_AVX512BITALGBW__
#endif /* __AVX512VLBW__ */

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
							 (__v64qi) __W,
							 (__mmask64) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi8 (__mmask64 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
						(__v64qi)
						_mm512_setzero_si512 (),
						(__mmask64) __U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
							(__v32hi) __W,
							(__mmask32) __U);
}

extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi16 (__mmask32 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
						(__v32hi)
						_mm512_setzero_si512 (),
						(__mmask32) __U);
}

extern __inline __mmask64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_bitshuffle_epi64_mask (__m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__mmask64) -1);
}

extern __inline __mmask64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__mmask64) __M);
}

#ifdef __DISABLE_AVX512BITALGBW__
#undef __DISABLE_AVX512BITALGBW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALGBW__ */

#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512bitalg,avx512vl,avx512bw")
#define __DISABLE_AVX512BITALGVLBW__
#endif /* __AVX512VLBW__ */

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_popcnt_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
							 (__v32qi) __W,
							 (__mmask32) __U);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_popcnt_epi8 (__mmask32 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
						(__v32qi)
						 _mm256_setzero_si256 (),
						(__mmask32) __U);
}

extern __inline __mmask32
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bitshuffle_epi64_mask (__m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__mmask32) -1);
}

extern __inline __mmask32
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__mmask32) __M);
}

#ifdef __DISABLE_AVX512BITALGVLBW__
#undef __DISABLE_AVX512BITALGVLBW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALGVLBW__ */


#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
#pragma GCC push_options
#pragma GCC target("avx512bitalg,avx512vl")
#define __DISABLE_AVX512BITALGVL__
#endif /* __AVX512VLBW__ */

extern __inline __mmask16
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__mmask16) -1);
}

extern __inline __mmask16
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_bitshuffle_epi64_mask (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__mmask16) __M);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_popcnt_epi8 (__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountb_v32qi ((__v32qi) __A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_popcnt_epi16 (__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountw_v16hi ((__v16hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_epi8 (__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountb_v16qi ((__v16qi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_epi16 (__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountw_v8hi ((__v8hi) __A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_popcnt_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
							(__v16hi) __W,
							(__mmask16) __U);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_popcnt_epi16 (__mmask16 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
						(__v16hi)
						_mm256_setzero_si256 (),
						(__mmask16) __U);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_popcnt_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
							 (__v16qi) __W,
							 (__mmask16) __U);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_popcnt_epi8 (__mmask16 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
							 (__v16qi)
							 _mm_setzero_si128 (),
							 (__mmask16) __U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_popcnt_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
							(__v8hi) __W,
							(__mmask8) __U);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
							(__v8hi)
							_mm_setzero_si128 (),
							(__mmask8) __U);
}
#ifdef __DISABLE_AVX512BITALGVL__
#undef __DISABLE_AVX512BITALGVL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALGBW__ */

#endif /* _AVX512BITALGINTRIN_H_INCLUDED */
/* Copyright (C) 2014-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512BWINTRIN_H_INCLUDED
#define _AVX512BWINTRIN_H_INCLUDED

#ifndef __AVX512BW__
#pragma GCC push_options
#pragma GCC target("avx512bw")
#define __DISABLE_AVX512BW__
#endif /* __AVX512BW__ */

/* Internal data types for implementing the intrinsics.  */
typedef short __v32hi __attribute__ ((__vector_size__ (64)));
typedef char __v64qi __attribute__ ((__vector_size__ (64)));

typedef unsigned long long __mmask64;

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_ktest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
{
  *__CF = (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_ktest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
{
  *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
{
  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
{
  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
{
  return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
{
  return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
{
  *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
{
  *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
{
  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
{
  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
{
  return (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
}

extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
{
  return (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kadd_mask32 (__mmask32 __A, __mmask32 __B)
{
  return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kadd_mask64 (__mmask64 __A, __mmask64 __B)
{
  return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B);
}

extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtmask32_u32 (__mmask32 __A)
{
  return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A);
}

extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtmask64_u64 (__mmask64 __A)
{
  return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtu32_mask32 (unsigned int __A)
{
  return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtu64_mask64 (unsigned long long __A)
{
  return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_load_mask32 (__mmask32 *__A)
{
  return (__mmask32) __builtin_ia32_kmovd (*__A);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_load_mask64 (__mmask64 *__A)
{
  return (__mmask64) __builtin_ia32_kmovq (*(__mmask64 *) __A);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_store_mask32 (__mmask32 *__A, __mmask32 __B)
{
  *(__mmask32 *) __A = __builtin_ia32_kmovd (__B);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_store_mask64 (__mmask64 *__A, __mmask64 __B)
{
  *(__mmask64 *) __A = __builtin_ia32_kmovq (__B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_knot_mask32 (__mmask32 __A)
{
  return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_knot_mask64 (__mmask64 __A)
{
  return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kor_mask32 (__mmask32 __A, __mmask32 __B)
{
  return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kor_mask64 (__mmask64 __A, __mmask64 __B)
{
  return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxnor_mask32 (__mmask32 __A, __mmask32 __B)
{
  return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxnor_mask64 (__mmask64 __A, __mmask64 __B)
{
  return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxor_mask32 (__mmask32 __A, __mmask32 __B)
{
  return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxor_mask64 (__mmask64 __A, __mmask64 __B)
{
  return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kand_mask32 (__mmask32 __A, __mmask32 __B)
{
  return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kand_mask64 (__mmask64 __A, __mmask64 __B)
{
  return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kandn_mask32 (__mmask32 __A, __mmask32 __B)
{
  return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kandn_mask64 (__mmask64 __A, __mmask64 __B)
{
  return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A,
						    (__v32hi) __W,
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P,
						     (__v32hi) __W,
						     (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
{
  __builtin_ia32_storedquhi512_mask ((short *) __P,
				     (__v32hi) __A,
				     (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A,
						    (__v64qi) __W,
						    (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A,
						    (__v64qi)
						    _mm512_setzero_si512 (),
						    (__mmask64) __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
{
  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
					      (__mmask32) __B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kunpackw_mask32 (__mmask16 __A, __mmask16 __B)
{
  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
					      (__mmask32) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
{
  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
					      (__mmask64) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kunpackd_mask64 (__mmask32 __A, __mmask32 __B)
{
  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
					      (__mmask64) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P,
						     (__v64qi) __W,
						     (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P,
						     (__v64qi)
						     _mm512_setzero_si512 (),
						     (__mmask64) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
{
  __builtin_ia32_storedquqi512_mask ((char *) __P,
				     (__v64qi) __A,
				     (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sad_epu8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
					     (__v64qi) __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi16_epi8 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
						  (__v32qi) _mm256_undefined_si256(),
						  (__mmask32) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
{
  __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
						  (__v32qi) __O, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsepi16_epi8 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
						   (__v32qi)_mm256_undefined_si256(),
						   (__mmask32) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
{
  __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
						   (__v32qi)__O,
						   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
						   (__v32qi)
						   _mm256_setzero_si256 (),
						   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtusepi16_epi8 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
						    (__v32qi)_mm256_undefined_si256(),
						    (__mmask32) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
						    (__v32qi) __O,
						    __M);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
{
  __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
						    (__v32qi)
						    _mm256_setzero_si256 (),
						    __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastb_epi8 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
						       (__v64qi)_mm512_undefined_epi32(),
						       (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
						       (__v64qi) __O,
						       __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
						       (__v64qi)
						       _mm512_setzero_si512 (),
						       __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
{
  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
							   (__v64qi) __O,
							   __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
{
  return (__m512i)
	 __builtin_ia32_pbroadcastb512_gpr_mask (__A,
						 (__v64qi)
						 _mm512_setzero_si512 (),
						 __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastw_epi16 (__m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
						       (__v32hi)_mm512_undefined_epi32(),
						       (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
						       (__v32hi) __O,
						       __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
{
  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
						       (__v32hi)
						       _mm512_setzero_si512 (),
						       __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
{
  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
							   (__v32hi) __O,
							   __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
{
  return (__m512i)
	 __builtin_ia32_pbroadcastw512_gpr_mask (__A,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			  __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v32hi) __W,
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			 __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
			 __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi) __W,
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v32hu) __A * (__v32hu) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			 __m512i __B)
{
  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepi8_epi16 (__m256i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
						    (__v32hi) __W,
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtepu8_epi16 (__m256i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
						    (__v32hi) __W,
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
{
  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
						     (__v32hi) __A,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
				__m512i __B)
{
  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
						     (__v32hi) __A,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
			       __m512i __B)
{
  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
						     (__v32hi) __A,
						     (__v32hi) __W,
						     (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I
							/* idx */ ,
							(__v32hi) __A,
							(__v32hi) __B,
							(__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
				__m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I
							/* idx */ ,
							(__v32hi) __A,
							(__v32hi) __B,
							(__mmask32)
							__U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
				 __mmask32 __U, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
							(__v32hi) __I
							/* idx */ ,
							(__v32hi) __B,
							(__mmask32)
							__U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
				 __m512i __I, __m512i __B)
{
  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
							 /* idx */ ,
							 (__v32hi) __A,
							 (__v32hi) __B,
							 (__mmask32)
							 __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_avg_epu8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__v64qi)
						 _mm512_setzero_si512 (),
						 (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
		      __m512i __B)
{
  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__v64qi) __W,
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__v64qi)
						 _mm512_setzero_si512 (),
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v64qu) __A + (__v64qu) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
		      __m512i __B)
{
  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__v64qi) __W,
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__v64qi)
						 _mm512_setzero_si512 (),
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v64qu) __A - (__v64qu) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
		      __m512i __B)
{
  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__v64qi) __W,
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
						 (__v64qi) __B,
						 (__v64qi)
						 _mm512_setzero_si512 (),
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_avg_epu16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
						 (__v32hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
						 (__v32hi) __B,
						 (__v32hi) __W,
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
						 (__v32hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_subs_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi) __W,
						  (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_subs_epu8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
						   (__v64qi) __B,
						   (__v64qi)
						   _mm512_setzero_si512 (),
						   (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
						   (__v64qi) __B,
						   (__v64qi) __W,
						   (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
						   (__v64qi) __B,
						   (__v64qi)
						   _mm512_setzero_si512 (),
						   (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_adds_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi) __W,
						  (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_adds_epu8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
						   (__v64qi) __B,
						   (__v64qi)
						   _mm512_setzero_si512 (),
						   (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
						   (__v64qi) __B,
						   (__v64qi) __W,
						   (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
						   (__v64qi) __B,
						   (__v64qi)
						   _mm512_setzero_si512 (),
						   (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v32hu) __A - (__v32hu) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
						 (__v32hi) __B,
						 (__v32hi) __W,
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
						 (__v32hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_subs_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_subs_epu16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi) __W,
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v32hu) __A + (__v32hu) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
						 (__v32hi) __B,
						 (__v32hi) __W,
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
						 (__v32hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_adds_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_adds_epu16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi) __W,
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srl_epi16 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
		       __m128i __B)
{
  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi) __W,
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_packs_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v64qi)
						    _mm512_setzero_si512 (),
						    (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sll_epi16 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
		       __m128i __B)
{
  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi) __W,
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maddubs_epi16 (__m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
						     (__v64qi) __Y,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
			   __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
						     (__v64qi) __Y,
						     (__v32hi) __W,
						     (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y)
{
  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
						     (__v64qi) __Y,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_madd_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v16si)
						   _mm512_setzero_si512 (),
						   (__mmask16) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v16si) __W,
						   (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
						   (__v32hi) __B,
						   (__v16si)
						   _mm512_setzero_si512 (),
						   (__mmask16) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpackhi_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__v64qi)
						     _mm512_setzero_si512 (),
						     (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
			   __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__v64qi) __W,
						     (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__v64qi)
						     _mm512_setzero_si512 (),
						     (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpackhi_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			    __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__v32hi) __W,
						     (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpacklo_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__v64qi)
						     _mm512_setzero_si512 (),
						     (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
			   __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__v64qi) __W,
						     (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__v64qi)
						     _mm512_setzero_si512 (),
						     (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_unpacklo_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			    __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__v32hi) __W,
						     (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__v32hi)
						     _mm512_setzero_si512 (),
						     (__mmask32) __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epu8_mask (__m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
						    (__v64qi) __B, 0,
						    (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epi8_mask (__m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
						    (__v64qi) __B, 0,
						    __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epu16_mask (__m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
						    (__v32hi) __B, 0,
						    (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpeq_epi16_mask (__m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
						    (__v32hi) __B, 0,
						    __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpeq_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epu8_mask (__m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
						    (__v64qi) __B, 6,
						    (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epi8_mask (__m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
						    (__v64qi) __B, 6,
						    __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A,
						     (__v64qi) __B,
						     __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epu16_mask (__m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
						    (__v32hi) __B, 6,
						    (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpgt_epi16_mask (__m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
						    (__v32hi) __B, 6,
						    __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpgt_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A,
						     (__v32hi) __B,
						     __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_movepi8_mask (__m512i __A)
{
  return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_movepi16_mask (__m512i __A)
{
  return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_movm_epi8 (__mmask64 __A)
{
  return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_movm_epi16 (__mmask32 __A)
{
  return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_test_epi8_mask (__m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
						(__v64qi) __B,
						(__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
						(__v64qi) __B, __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_test_epi16_mask (__m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
						(__v32hi) __B,
						(__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
						(__v32hi) __B, __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
						 (__v64qi) __B,
						 (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
						 (__v64qi) __B, __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
						 (__v32hi) __B,
						 (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
						 (__v32hi) __B, __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
			  __m512i __B)
{
  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi) __W,
						  (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epu16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epu8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
		      __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi) __W,
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
		      __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi) __W,
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epu8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
		      __m512i __B)
{
  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi) __W,
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi)
						  _mm512_setzero_si512 (),
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
		      __m512i __B)
{
  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
						  (__v64qi) __B,
						  (__v64qi) __W,
						  (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_epu16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
		       __m512i __B)
{
  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sra_epi16 (__m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
		       __m128i __B)
{
  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi) __W,
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
{
  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
						 (__v8hi) __B,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srav_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srlv_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sllv_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B)
{
  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
						  (__v32hi) __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
			 __m512i __B)
{
  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v64qi) __W,
						    (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v64qi)
						    _mm512_setzero_si512 (),
						    __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_packus_epi16 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v64qi)
						    _mm512_setzero_si512 (),
						    (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
			  __m512i __B)
{
  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v64qi) __W,
						    (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
						    (__v32hi) __B,
						    (__v64qi)
						    _mm512_setzero_si512 (),
						    (__mmask64) __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_abs_epi8 (__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
						 (__v64qi)
						 _mm512_setzero_si512 (),
						 (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
						 (__v64qi) __W,
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
						 (__v64qi)
						 _mm512_setzero_si512 (),
						 (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_abs_epi16 (__m512i __A)
{
  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
						 (__v32hi) __W,
						 (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
						 (__v32hi)
						 _mm512_setzero_si512 (),
						 (__mmask32) __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 4,
						   (__mmask64) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 1,
						   (__mmask64) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 5,
						   (__mmask64) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 2,
						   (__mmask64) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 4,
						   (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 1,
						   (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 5,
						   (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 2,
						   (__mmask32) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 4,
						  (__mmask64) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 1,
						  (__mmask64) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 5,
						  (__mmask64) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 2,
						  (__mmask64) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpneq_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 4,
						  (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmplt_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 1,
						  (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmpge_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 5,
						  (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmple_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 2,
						  (__mmask32) __M);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 4,
						   (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 1,
						   (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 5,
						   (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epu8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, 2,
						   (__mmask64) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 4,
						   (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 1,
						   (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 5,
						   (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epu16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, 2,
						   (__mmask32) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 4,
						  (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 1,
						  (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 5,
						  (__mmask64) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epi8_mask (__m512i __X, __m512i __Y)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, 2,
						  (__mmask64) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 4,
						  (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 1,
						  (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 5,
						  (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmple_epi16_mask (__m512i __X, __m512i __Y)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, 2,
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_packs_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
						    (__v16si) __B,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
						    (__v16si) __B,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
			 __m512i __B)
{
  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
						    (__v16si) __B,
						    (__v32hi) __W,
						    __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_packus_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
						    (__v16si) __B,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
						    (__v16si) __B,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    __M);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
			  __m512i __B)
{
  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
						    (__v16si) __B,
						    (__v32hi) __W,
						    __M);
}

#ifdef __OPTIMIZE__
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kshiftli_mask32 (__mmask32 __A, unsigned int __B)
{
  return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A,
						(__mmask8) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kshiftli_mask64 (__mmask64 __A, unsigned int __B)
{
  return (__mmask64) __builtin_ia32_kshiftlidi ((__mmask64) __A,
						(__mmask8) __B);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kshiftri_mask32 (__mmask32 __A, unsigned int __B)
{
  return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A,
						(__mmask8) __B);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kshiftri_mask64 (__mmask64 __A, unsigned int __B)
{
  return (__mmask64) __builtin_ia32_kshiftridi ((__mmask64) __A,
						(__mmask8) __B);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_alignr_epi8 (__m512i __A, __m512i __B, const int __N)
{
  return (__m512i) __builtin_ia32_palignr512 ((__v8di) __A,
					      (__v8di) __B, __N * 8);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_alignr_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
			 __m512i __B, const int __N)
{
  return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A,
						   (__v8di) __B,
						   __N * 8,
						   (__v8di) __W,
						   (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_alignr_epi8 (__mmask64 __U, __m512i __A, __m512i __B,
			  const int __N)
{
  return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A,
						   (__v8di) __B,
						   __N * 8,
						   (__v8di)
						   _mm512_setzero_si512 (),
						   (__mmask64) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_dbsad_epu8 (__m512i __A, __m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
						    (__v64qi) __B,
						    __imm,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_dbsad_epu8 (__m512i __W, __mmask32 __U, __m512i __A,
			__m512i __B, const int __imm)
{
  return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
						    (__v64qi) __B,
						    __imm,
						    (__v32hi) __W,
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B,
			 const int __imm)
{
  return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
						    (__v64qi) __B,
						    __imm,
						    (__v32hi)
						    _mm512_setzero_si512 (),
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srli_epi16 (__m512i __A, const int __imm)
{
  return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			const int __imm)
{
  return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm)
{
  return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_slli_epi16 (__m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			const int __B)
{
  return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B)
{
  return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shufflehi_epi16 (__m512i __A, const int __imm)
{
  return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
						   __imm,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shufflehi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			     const int __imm)
{
  return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
						   __imm,
						   (__v32hi) __W,
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shufflehi_epi16 (__mmask32 __U, __m512i __A,
			      const int __imm)
{
  return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
						   __imm,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shufflelo_epi16 (__m512i __A, const int __imm)
{
  return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
						   __imm,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shufflelo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			     const int __imm)
{
  return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
						   __imm,
						   (__v32hi) __W,
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shufflelo_epi16 (__mmask32 __U, __m512i __A,
			      const int __imm)
{
  return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
						   __imm,
						   (__v32hi)
						   _mm512_setzero_si512 (),
						   (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_srai_epi16 (__m512i __A, const int __imm)
{
  return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
			const int __imm)
{
  return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
						  (__v32hi) __W,
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const int __imm)
{
  return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
						  (__v32hi)
						  _mm512_setzero_si512 (),
						  (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A,
						    (__v32hi) __W,
						    (__mmask32) __U);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A,
						    (__v64qi) __W,
						    (__mmask64) __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epi16_mask (__mmask32 __U, __m512i __X, __m512i __Y,
			    const int __P)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, __P,
						  (__mmask32) __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epi16_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
						  (__v32hi) __Y, __P,
						  (__mmask32) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
			   const int __P)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, __P,
						  (__mmask64) __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epi8_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
						  (__v64qi) __Y, __P,
						  (__mmask64) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epu16_mask (__mmask32 __U, __m512i __X, __m512i __Y,
			    const int __P)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, __P,
						   (__mmask32) __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epu16_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
						   (__v32hi) __Y, __P,
						   (__mmask32) -1);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
			   const int __P)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, __P,
						   (__mmask64) __U);
}

extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_epu8_mask (__m512i __X, __m512i __Y, const int __P)
{
  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
						   (__v64qi) __Y, __P,
						   (__mmask64) -1);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_bslli_epi128 (__m512i __A, const int __N)
{
  return (__m512i) __builtin_ia32_pslldq512 (__A, __N * 8);
}

extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_bsrli_epi128 (__m512i __A, const int __N)
{
  return (__m512i) __builtin_ia32_psrldq512 (__A, __N * 8);
}

#else
#define _kshiftli_mask32(X, Y)							\
  ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y)))

#define _kshiftli_mask64(X, Y)							\
  ((__mmask64) __builtin_ia32_kshiftlidi ((__mmask64)(X), (__mmask8)(Y)))

#define _kshiftri_mask32(X, Y)							\
  ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y)))

#define _kshiftri_mask64(X, Y)							\
  ((__mmask64) __builtin_ia32_kshiftridi ((__mmask64)(X), (__mmask8)(Y)))

#define _mm512_alignr_epi8(X, Y, N)						    \
  ((__m512i) __builtin_ia32_palignr512 ((__v8di)(__m512i)(X),			    \
					(__v8di)(__m512i)(Y),			    \
					(int)((N) * 8)))

#define _mm512_mask_alignr_epi8(W, U, X, Y, N)					    \
  ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X),		    \
					    (__v8di)(__m512i)(Y), (int)((N) * 8),   \
					    (__v8di)(__m512i)(W), (__mmask64)(U)))

#define _mm512_maskz_alignr_epi8(U, X, Y, N)					    \
  ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X),		    \
					     (__v8di)(__m512i)(Y), (int)((N) * 8),  \
					     (__v8di)(__m512i)			    \
					     _mm512_setzero_si512 (),		    \
					     (__mmask64)(U)))

#define _mm512_dbsad_epu8(X, Y, C)                                                  \
  ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X),               \
                                              (__v64qi)(__m512i) (Y), (int) (C),    \
                                              (__v32hi)(__m512i)		    \
					      _mm512_setzero_si512 (),		    \
                                              (__mmask32)-1))

#define _mm512_mask_dbsad_epu8(W, U, X, Y, C)                                       \
  ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X),               \
                                              (__v64qi)(__m512i) (Y), (int) (C),    \
                                              (__v32hi)(__m512i)(W),                \
                                              (__mmask32)(U)))

#define _mm512_maskz_dbsad_epu8(U, X, Y, C)                                         \
  ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X),               \
                                              (__v64qi)(__m512i) (Y), (int) (C),    \
                                              (__v32hi)(__m512i)		    \
					      _mm512_setzero_si512 (),		    \
                                              (__mmask32)(U)))

#define _mm512_srli_epi16(A, B)                                         \
  ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A),      \
    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))

#define _mm512_mask_srli_epi16(W, U, A, B)                              \
  ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A),      \
    (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))

#define _mm512_maskz_srli_epi16(U, A, B)                                \
  ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A),      \
    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))

#define _mm512_slli_epi16(X, C)						   \
  ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
    (__v32hi)(__m512i)_mm512_setzero_si512 (),				   \
    (__mmask32)-1))

#define _mm512_mask_slli_epi16(W, U, X, C)                                 \
  ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
    (__v32hi)(__m512i)(W),\
    (__mmask32)(U)))

#define _mm512_maskz_slli_epi16(U, X, C)                                   \
  ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
    (__v32hi)(__m512i)_mm512_setzero_si512 (),				   \
    (__mmask32)(U)))

#define _mm512_shufflehi_epi16(A, B)                                                \
  ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
                                             (__v32hi)(__m512i)			    \
					     _mm512_setzero_si512 (),		    \
                                             (__mmask32)-1))

#define _mm512_mask_shufflehi_epi16(W, U, A, B)                                     \
  ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
                                             (__v32hi)(__m512i)(W),                 \
                                             (__mmask32)(U)))

#define _mm512_maskz_shufflehi_epi16(U, A, B)                                       \
  ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
                                             (__v32hi)(__m512i)			    \
					     _mm512_setzero_si512 (),		    \
                                             (__mmask32)(U)))

#define _mm512_shufflelo_epi16(A, B)                                                \
  ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
                                             (__v32hi)(__m512i)			    \
					     _mm512_setzero_si512 (),		    \
                                             (__mmask32)-1))

#define _mm512_mask_shufflelo_epi16(W, U, A, B)                                     \
  ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
                                             (__v32hi)(__m512i)(W),                 \
                                             (__mmask32)(U)))

#define _mm512_maskz_shufflelo_epi16(U, A, B)                                       \
  ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
                                             (__v32hi)(__m512i)			    \
					     _mm512_setzero_si512 (),		    \
                                             (__mmask32)(U)))

#define _mm512_srai_epi16(A, B)                                         \
  ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A),      \
    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))

#define _mm512_mask_srai_epi16(W, U, A, B)                              \
  ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A),      \
    (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))

#define _mm512_maskz_srai_epi16(U, A, B)                                \
  ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A),      \
    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))

#define _mm512_mask_blend_epi16(__U, __A, __W)			      \
  ((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A),	      \
						    (__v32hi) (__W),  \
						    (__mmask32) (__U)))

#define _mm512_mask_blend_epi8(__U, __A, __W)			      \
  ((__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) (__A),	      \
						    (__v64qi) (__W),  \
						    (__mmask64) (__U)))

#define _mm512_cmp_epi16_mask(X, Y, P)				\
  ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X),	\
					    (__v32hi)(__m512i)(Y), (int)(P),\
					    (__mmask32)(-1)))

#define _mm512_cmp_epi8_mask(X, Y, P)				\
  ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X),	\
					    (__v64qi)(__m512i)(Y), (int)(P),\
					    (__mmask64)(-1)))

#define _mm512_cmp_epu16_mask(X, Y, P)				\
  ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X),	\
					    (__v32hi)(__m512i)(Y), (int)(P),\
					    (__mmask32)(-1)))

#define _mm512_cmp_epu8_mask(X, Y, P)				\
  ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X),	\
					    (__v64qi)(__m512i)(Y), (int)(P),\
					    (__mmask64)(-1)))

#define _mm512_mask_cmp_epi16_mask(M, X, Y, P)				\
  ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X),	\
					    (__v32hi)(__m512i)(Y), (int)(P),\
					    (__mmask32)(M)))

#define _mm512_mask_cmp_epi8_mask(M, X, Y, P)				\
  ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X),	\
					    (__v64qi)(__m512i)(Y), (int)(P),\
					    (__mmask64)(M)))

#define _mm512_mask_cmp_epu16_mask(M, X, Y, P)				\
  ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X),	\
					    (__v32hi)(__m512i)(Y), (int)(P),\
					    (__mmask32)(M)))

#define _mm512_mask_cmp_epu8_mask(M, X, Y, P)				\
  ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X),	\
					    (__v64qi)(__m512i)(Y), (int)(P),\
					    (__mmask64)(M)))

#define _mm512_bslli_epi128(A, N)                                         \
  ((__m512i)__builtin_ia32_pslldq512 ((__m512i)(A), (int)(N) * 8))

#define _mm512_bsrli_epi128(A, N)                                         \
  ((__m512i)__builtin_ia32_psrldq512 ((__m512i)(A), (int)(N) * 8))

#endif

#ifdef __DISABLE_AVX512BW__
#undef __DISABLE_AVX512BW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BW__ */

#endif /* _AVX512BWINTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <clflushoptintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _CLFLUSHOPTINTRIN_H_INCLUDED
#define _CLFLUSHOPTINTRIN_H_INCLUDED

#ifndef __CLFLUSHOPT__
#pragma GCC push_options
#pragma GCC target("clflushopt")
#define __DISABLE_CLFLUSHOPT__
#endif /* __CLFLUSHOPT__ */

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflushopt (void *__A)
{
  __builtin_ia32_clflushopt (__A);
}

#ifdef __DISABLE_CLFLUSHOPT__
#undef __DISABLE_CLFLUSHOPT__
#pragma GCC pop_options
#endif /* __DISABLE_CLFLUSHOPT__ */

#endif /* _CLFLUSHOPTINTRIN_H_INCLUDED */
/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef __CROSS_STDARG_H_INCLUDED
#define __CROSS_STDARG_H_INCLUDED

/* Make sure that for non x64 targets cross builtins are defined.  */
#ifndef __x86_64__
/* Call abi ms_abi.  */
#define __builtin_ms_va_list __builtin_va_list
#define __builtin_ms_va_copy __builtin_va_copy
#define __builtin_ms_va_start __builtin_va_start
#define __builtin_ms_va_end __builtin_va_end

/* Call abi sysv_abi.  */
#define __builtin_sysv_va_list __builtin_va_list
#define __builtin_sysv_va_copy __builtin_va_copy
#define __builtin_sysv_va_start __builtin_va_start
#define __builtin_sysv_va_end __builtin_va_end
#endif

#define __ms_va_copy(__d,__s) __builtin_ms_va_copy(__d,__s)
#define __ms_va_start(__v,__l) __builtin_ms_va_start(__v,__l)
#define __ms_va_arg(__v,__l)	__builtin_va_arg(__v,__l)
#define __ms_va_end(__v) __builtin_ms_va_end(__v)

#define __sysv_va_copy(__d,__s) __builtin_sysv_va_copy(__d,__s)
#define __sysv_va_start(__v,__l) __builtin_sysv_va_start(__v,__l)
#define __sysv_va_arg(__v,__l)	__builtin_va_arg(__v,__l)
#define __sysv_va_end(__v) __builtin_sysv_va_end(__v)

#ifndef __GNUC_SYSV_VA_LIST
#define __GNUC_SYSV_VA_LIST
  typedef __builtin_sysv_va_list __gnuc_sysv_va_list;
#endif

#ifndef _SYSV_VA_LIST_DEFINED
#define _SYSV_VA_LIST_DEFINED
  typedef __gnuc_sysv_va_list sysv_va_list;
#endif

#ifndef __GNUC_MS_VA_LIST
#define __GNUC_MS_VA_LIST
  typedef __builtin_ms_va_list __gnuc_ms_va_list;
#endif

#ifndef _MS_VA_LIST_DEFINED
#define _MS_VA_LIST_DEFINED
  typedef __gnuc_ms_va_list ms_va_list;
#endif

#endif /* __CROSS_STDARG_H_INCLUDED */
/* syslimits.h stands for the system's own limits.h file.
   If we can use it ok unmodified, then we install this text.
   If fixincludes fixes it, then the fixed version is installed
   instead of this text.  */

#define _GCC_NEXT_LIMITS_H		/* tell gcc's limits.h to recurse */
#include_next <limits.h>
#undef _GCC_NEXT_LIMITS_H
/* Copyright (C) 2014-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VLBWINTRIN_H_INCLUDED
#define _AVX512VLBWINTRIN_H_INCLUDED

#if !defined(__AVX512VL__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512vl,avx512bw")
#define __DISABLE_AVX512VLBW__
#endif /* __AVX512VLBW__ */


extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A,
						    (__v32qi) __W,
						    (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A,
						    (__v32qi)
						    _mm256_setzero_si256 (),
						    (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A,
						    (__v16qi) __W,
						    (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A,
						    (__v16qi)
						    _mm_setzero_si128 (),
						    (__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
{
  __builtin_ia32_storedquqi256_mask ((char *) __P,
				     (__v32qi) __A,
				     (__mmask32) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
{
  __builtin_ia32_storedquqi128_mask ((char *) __P,
				     (__v16qi) __A,
				     (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P,
						     (__v16hi) __W,
						     (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P,
						     (__v8hi) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P,
						     (__v8hi)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}


extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A,
						    (__v16hi) __W,
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A,
						    (__v8hi) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P,
						     (__v32qi) __W,
						     (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P,
						     (__v32qi)
						     _mm256_setzero_si256 (),
						     (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P,
						     (__v16qi) __W,
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P,
						     (__v16qi)
						     _mm_setzero_si128 (),
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi8 (__m256i __A)
{

  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
						  (__v16qi)_mm_undefined_si128(),
						  (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A)
{
  __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
{
  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
						  (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A)
{
  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsepi16_epi8 (__m128i __A)
{

  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
						   (__v16qi)_mm_undefined_si128(),
						   (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A)
{
  __builtin_ia32_pmovswb128mem_mask ((__v8qi *) __P , (__v8hi) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
						   (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
						   (__v16qi)
						   _mm_setzero_si128 (),
						   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsepi16_epi8 (__m256i __A)
{

  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
						   (__v16qi)_mm_undefined_si128(),
						   (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A)
{
  __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
{
  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
						   (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A)
{
  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
						   (__v16qi)
						   _mm_setzero_si128 (),
						   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtusepi16_epi8 (__m128i __A)
{

  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
						    (__v16qi)_mm_undefined_si128(),
						    (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A)
{
  __builtin_ia32_pmovuswb128mem_mask ((__v8qi *) __P , (__v8hi) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
						    (__v16qi) __O,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
						    (__v16qi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtusepi16_epi8 (__m256i __A)
{

  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
						    (__v16qi)_mm_undefined_si128(),
						    (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A)
{
  __builtin_ia32_pmovuswb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
{
  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
						    (__v16qi) __O,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A)
{
  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
						    (__v16qi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
						       (__v32qi) __O,
						       __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
						       (__v32qi)
						       _mm256_setzero_si256 (),
						       __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
{
  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
							   (__v32qi) __O,
							   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
{
  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
							   (__v32qi)
							   _mm256_setzero_si256 (),
							   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
						       (__v16qi) __O,
						       __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
						       (__v16qi)
						       _mm_setzero_si128 (),
						       __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
{
  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
							   (__v16qi) __O,
							   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
{
  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
							   (__v16qi)
							   _mm_setzero_si128 (),
							   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
						       (__v16hi) __O,
						       __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
{
  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
						       (__v16hi)
						       _mm256_setzero_si256 (),
						       __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
{
  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
							   (__v16hi) __O,
							   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
{
  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
							   (__v16hi)
							   _mm256_setzero_si256 (),
							   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
						       (__v8hi) __O,
						       __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
						       (__v8hi)
						       _mm_setzero_si128 (),
						       __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
{
  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
							   (__v8hi) __O,
							   __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
{
  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
							   (__v8hi)
							   _mm_setzero_si128 (),
							   __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
						     (__v16hi) __A,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
				__m256i __B)
{
  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
						     (__v16hi) __A,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
			       __m256i __B)
{
  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
						     (__v16hi) __A,
						     (__v16hi) __W,
						     (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
						     (__v8hi) __A,
						     (__v8hi)
						     _mm_setzero_si128 (),
						     (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
						     (__v8hi) __A,
						     (__v8hi)
						     _mm_setzero_si128 (),
						     (__mmask8) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
			    __m128i __B)
{
  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
						     (__v8hi) __A,
						     (__v8hi) __W,
						     (__mmask8) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
{
  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I
							/* idx */ ,
							(__v16hi) __A,
							(__v16hi) __B,
							(__mmask16) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
				__m256i __I, __m256i __B)
{
  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I
							/* idx */ ,
							(__v16hi) __A,
							(__v16hi) __B,
							(__mmask16)
							__U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
				 __mmask16 __U, __m256i __B)
{
  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
							(__v16hi) __I
							/* idx */ ,
							(__v16hi) __B,
							(__mmask16)
							__U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
				 __m256i __I, __m256i __B)
{
  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I
							 /* idx */ ,
							 (__v16hi) __A,
							 (__v16hi) __B,
							 (__mmask16)
							 __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I
							/* idx */ ,
							(__v8hi) __A,
							(__v8hi) __B,
							(__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
			     __m128i __B)
{
  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I
							/* idx */ ,
							(__v8hi) __A,
							(__v8hi) __B,
							(__mmask8)
							__U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
			      __m128i __B)
{
  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
							(__v8hi) __I
							/* idx */ ,
							(__v8hi) __B,
							(__mmask8)
							__U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
			      __m128i __B)
{
  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I
							 /* idx */ ,
							 (__v8hi) __A,
							 (__v8hi) __B,
							 (__mmask8)
							 __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
			   __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
						     (__v32qi) __Y,
						     (__v16hi) __W,
						     (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
						     (__v32qi) __Y,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X,
			__m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
						     (__v16qi) __Y,
						     (__v8hi) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
						     (__v16qi) __Y,
						     (__v8hi)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v8si) __W,
						   (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v8si)
						   _mm256_setzero_si256 (),
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v4si) __W,
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v4si)
						   _mm_setzero_si128 (),
						   (__mmask8) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi8_mask (__m128i __A)
{
  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movepi8_mask (__m256i __A)
{
  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi16_mask (__m128i __A)
{
  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movepi16_mask (__m256i __A)
{
  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movm_epi8 (__mmask16 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movm_epi8 (__mmask32 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_movm_epi16 (__mmask8 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movm_epi16 (__mmask16 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_test_epi8_mask (__m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
						(__v16qi) __B,
						(__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
						(__v16qi) __B, __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_test_epi8_mask (__m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
						(__v32qi) __B,
						(__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
						(__v32qi) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_test_epi16_mask (__m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
					       (__v8hi) __B,
					       (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
					       (__v8hi) __B, __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_test_epi16_mask (__m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
						(__v16hi) __B,
						(__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
						(__v16hi) __B, __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  (__mmask32) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
		      __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi) __W,
						  (__mmask32) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
		   __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi) __W,
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  (__mmask32) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
		      __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi) __W,
						  (__mmask32) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
		   __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi) __W,
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  (__mmask32) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
		      __m256i __B)
{
  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi) __W,
						  (__mmask32) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
		   __m128i __B)
{
  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi) __W,
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  (__mmask32) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
		      __m256i __B)
{
  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi) __W,
						  (__mmask32) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
		   __m128i __B)
{
  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi) __W,
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __M);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_alignr_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
			 __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A,
						   (__v4di) __B,
						   __N * 8,
						   (__v4di) __W,
						   (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_alignr_epi8 (__mmask32 __U, __m256i __A, __m256i __B,
			  const int __N)
{
  return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A,
						   (__v4di) __B,
						   __N * 8,
						   (__v4di)
						   _mm256_setzero_si256 (),
						   (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_alignr_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
		      __m128i __B, const int __N)
{
  return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A,
						   (__v2di) __B,
						   __N * 8,
						   (__v2di) __W,
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_alignr_epi8 (__mmask16 __U, __m128i __A, __m128i __B,
		       const int __N)
{
  return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A,
						   (__v2di) __B,
						   __N * 8,
						   (__v2di)
						   _mm_setzero_si128 (),
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dbsad_epu8 (__m256i __A, __m256i __B, const int __imm)
{
  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
						    (__v32qi) __B,
						    __imm,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    (__mmask16) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_dbsad_epu8 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B, const int __imm)
{
  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
						    (__v32qi) __B,
						    __imm,
						    (__v16hi) __W,
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dbsad_epu8 (__mmask16 __U, __m256i __A, __m256i __B,
			 const int __imm)
{
  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
						    (__v32qi) __B,
						    __imm,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_dbsad_epu8 (__m128i __A, __m128i __B, const int __imm)
{
  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
						    (__v16qi) __B,
						    __imm,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dbsad_epu8 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B, const int __imm)
{
  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
						    (__v16qi) __B,
						    __imm,
						    (__v8hi) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B,
		      const int __imm)
{
  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
						    (__v16qi) __B,
						    __imm,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
{
  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
						    (__v8hi) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
{
  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
						    (__v16qi) __W,
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
{
  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
						    (__v16hi) __W,
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
{
  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
						    (__v32qi) __W,
						    (__mmask32) __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_epi16_mask (__mmask8 __U, __m128i __X, __m128i __Y,
			 const int __P)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, __P,
						 (__mmask8) __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_epi16_mask (__m128i __X, __m128i __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, __P,
						 (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmp_epi16_mask (__mmask16 __U, __m256i __X, __m256i __Y,
			    const int __P)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, __P,
						  (__mmask16) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_epi16_mask (__m256i __X, __m256i __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, __P,
						  (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_epi8_mask (__mmask16 __U, __m128i __X, __m128i __Y,
			const int __P)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, __P,
						  (__mmask16) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_epi8_mask (__m128i __X, __m128i __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, __P,
						  (__mmask16) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmp_epi8_mask (__mmask32 __U, __m256i __X, __m256i __Y,
			   const int __P)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, __P,
						  (__mmask32) __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_epi8_mask (__m256i __X, __m256i __Y, const int __P)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, __P,
						  (__mmask32) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_epu16_mask (__mmask8 __U, __m128i __X, __m128i __Y,
			 const int __P)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, __P,
						  (__mmask8) __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_epu16_mask (__m128i __X, __m128i __Y, const int __P)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, __P,
						  (__mmask8) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmp_epu16_mask (__mmask16 __U, __m256i __X, __m256i __Y,
			    const int __P)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, __P,
						   (__mmask16) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_epu16_mask (__m256i __X, __m256i __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, __P,
						   (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_epu8_mask (__mmask16 __U, __m128i __X, __m128i __Y,
			const int __P)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, __P,
						   (__mmask16) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_epu8_mask (__m128i __X, __m128i __Y, const int __P)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, __P,
						   (__mmask16) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmp_epu8_mask (__mmask32 __U, __m256i __X, __m256i __Y,
			   const int __P)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, __P,
						   (__mmask32) __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_epu8_mask (__m256i __X, __m256i __Y, const int __P)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, __P,
						   (__mmask32) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			const int __imm)
{
  return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_srli_epi16 (__mmask16 __U, __m256i __A, const int __imm)
{
  return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srli_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     const int __imm)
{
  return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm,
						  (__v8hi) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, const int __imm)
{
  return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shufflehi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			     const int __imm)
{
  return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A,
						   __imm,
						   (__v16hi) __W,
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shufflehi_epi16 (__mmask16 __U, __m256i __A,
			      const int __imm)
{
  return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A,
						   __imm,
						   (__v16hi)
						   _mm256_setzero_si256 (),
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shufflehi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
			  const int __imm)
{
  return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm,
						   (__v8hi) __W,
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shufflehi_epi16 (__mmask8 __U, __m128i __A, const int __imm)
{
  return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm,
						   (__v8hi)
						   _mm_setzero_si128 (),
						   (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shufflelo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			     const int __imm)
{
  return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A,
						   __imm,
						   (__v16hi) __W,
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shufflelo_epi16 (__mmask16 __U, __m256i __A,
			      const int __imm)
{
  return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A,
						   __imm,
						   (__v16hi)
						   _mm256_setzero_si256 (),
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shufflelo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
			  const int __imm)
{
  return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm,
						   (__v8hi) __W,
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shufflelo_epi16 (__mmask8 __U, __m128i __A, const int __imm)
{
  return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm,
						   (__v8hi)
						   _mm_setzero_si128 (),
						   (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			const int __imm)
{
  return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm)
{
  return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     const int __imm)
{
  return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
						  (__v8hi) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm)
{
  return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			int __B)
{
  return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B)
{
  return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
{
  return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
						  (__v8hi) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
{
  return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

#else
#define _mm256_mask_alignr_epi8(W, U, X, Y, N)					    \
  ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X),		    \
					    (__v4di)(__m256i)(Y), (int)((N) * 8),   \
					    (__v4di)(__m256i)(X), (__mmask32)(U)))

#define _mm256_mask_srli_epi16(W, U, A, B)                              \
  ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A),      \
    (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))

#define _mm256_maskz_srli_epi16(U, A, B)                                \
  ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A),      \
    (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))

#define _mm_mask_srli_epi16(W, U, A, B)                                 \
  ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A),       \
    (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_srli_epi16(U, A, B)                                   \
  ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A),       \
    (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))

#define _mm256_mask_srai_epi16(W, U, A, B)                              \
  ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A),      \
    (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))

#define _mm256_maskz_srai_epi16(U, A, B)                                \
  ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A),      \
    (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))

#define _mm_mask_srai_epi16(W, U, A, B)                                 \
  ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A),       \
    (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))

#define _mm_maskz_srai_epi16(U, A, B)                                   \
  ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A),       \
    (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))

#define _mm256_mask_shufflehi_epi16(W, U, A, B)                                     \
  ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
                                             (__v16hi)(__m256i)(W),                 \
                                             (__mmask16)(U)))

#define _mm256_maskz_shufflehi_epi16(U, A, B)                                       \
  ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
                                             (__v16hi)(__m256i)_mm256_setzero_si256 (), \
                                             (__mmask16)(U)))

#define _mm_mask_shufflehi_epi16(W, U, A, B)                                        \
  ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
                                             (__v8hi)(__m128i)(W),                  \
                                             (__mmask8)(U)))

#define _mm_maskz_shufflehi_epi16(U, A, B)                                          \
  ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
					     (__v8hi)(__m128i)_mm_setzero_si128 (), \
                                             (__mmask8)(U)))

#define _mm256_mask_shufflelo_epi16(W, U, A, B)                                     \
  ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
                                             (__v16hi)(__m256i)(W),                 \
                                             (__mmask16)(U)))

#define _mm256_maskz_shufflelo_epi16(U, A, B)                                       \
  ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
                                             (__v16hi)(__m256i)_mm256_setzero_si256 (), \
                                             (__mmask16)(U)))

#define _mm_mask_shufflelo_epi16(W, U, A, B)                                        \
  ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
                                             (__v8hi)(__m128i)(W),                  \
                                             (__mmask8)(U)))

#define _mm_maskz_shufflelo_epi16(U, A, B)                                          \
  ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
					     (__v8hi)(__m128i)_mm_setzero_si128 (), \
                                             (__mmask8)(U)))

#define _mm256_maskz_alignr_epi8(U, X, Y, N)					    \
  ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X),		    \
					    (__v4di)(__m256i)(Y), (int)((N) * 8),   \
					    (__v4di)(__m256i)_mm256_setzero_si256 (),   \
					    (__mmask32)(U)))

#define _mm_mask_alignr_epi8(W, U, X, Y, N)					    \
  ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X),		    \
					    (__v2di)(__m128i)(Y), (int)((N) * 8),   \
					    (__v2di)(__m128i)(X), (__mmask16)(U)))

#define _mm_maskz_alignr_epi8(U, X, Y, N)					    \
  ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X),		    \
					    (__v2di)(__m128i)(Y), (int)((N) * 8),   \
					    (__v2di)(__m128i)_mm_setzero_si128 (),  \
					    (__mmask16)(U)))

#define _mm_mask_slli_epi16(W, U, X, C)					  \
  ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
    (__v8hi)(__m128i)(W),\
    (__mmask8)(U)))

#define _mm_maskz_slli_epi16(U, X, C)					  \
  ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
    (__v8hi)(__m128i)_mm_setzero_si128 (),\
    (__mmask8)(U)))

#define _mm256_dbsad_epu8(X, Y, C)                                                  \
  ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X),               \
                                              (__v32qi)(__m256i) (Y), (int) (C),    \
                                              (__v16hi)(__m256i)_mm256_setzero_si256(),\
                                              (__mmask16)-1))

#define _mm256_mask_slli_epi16(W, U, X, C)                                 \
  ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
    (__v16hi)(__m256i)(W),\
    (__mmask16)(U)))

#define _mm256_maskz_slli_epi16(U, X, C)                                   \
  ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
    (__v16hi)(__m256i)_mm256_setzero_si256 (),\
    (__mmask16)(U)))

#define _mm256_mask_dbsad_epu8(W, U, X, Y, C)                                       \
  ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X),               \
                                              (__v32qi)(__m256i) (Y), (int) (C),    \
                                              (__v16hi)(__m256i)(W),                \
                                              (__mmask16)(U)))

#define _mm256_maskz_dbsad_epu8(U, X, Y, C)                                         \
  ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X),               \
                                              (__v32qi)(__m256i) (Y), (int) (C),    \
                                              (__v16hi)(__m256i)_mm256_setzero_si256(),\
                                              (__mmask16)(U)))

#define _mm_dbsad_epu8(X, Y, C)                                                     \
  ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X),               \
                                              (__v16qi)(__m128i) (Y), (int) (C),    \
                                              (__v8hi)(__m128i)_mm_setzero_si128(), \
                                              (__mmask8)-1))

#define _mm_mask_dbsad_epu8(W, U, X, Y, C)                                          \
  ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X),               \
                                              (__v16qi)(__m128i) (Y), (int) (C),    \
                                              (__v8hi)(__m128i)(W),                 \
                                              (__mmask8)(U)))

#define _mm_maskz_dbsad_epu8(U, X, Y, C)                                            \
  ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X),               \
                                              (__v16qi)(__m128i) (Y), (int) (C),    \
                                              (__v8hi)(__m128i)_mm_setzero_si128(), \
                                              (__mmask8)(U)))

#define _mm_mask_blend_epi16(__U, __A, __W)			      \
  ((__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) (__A),	      \
						    (__v8hi) (__W),   \
						    (__mmask8) (__U)))

#define _mm_mask_blend_epi8(__U, __A, __W)			      \
  ((__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) (__A),	      \
						    (__v16qi) (__W),  \
						    (__mmask16) (__U)))

#define _mm256_mask_blend_epi16(__U, __A, __W)			      \
  ((__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) (__A),	      \
						    (__v16hi) (__W),  \
						    (__mmask16) (__U)))

#define _mm256_mask_blend_epi8(__U, __A, __W)			      \
  ((__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) (__A),	      \
						    (__v32qi) (__W),  \
						    (__mmask32) (__U)))

#define _mm_cmp_epi16_mask(X, Y, P)				\
  ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X),	\
					    (__v8hi)(__m128i)(Y), (int)(P),\
					    (__mmask8)(-1)))

#define _mm_cmp_epi8_mask(X, Y, P)				\
  ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X),	\
					    (__v16qi)(__m128i)(Y), (int)(P),\
					    (__mmask16)(-1)))

#define _mm256_cmp_epi16_mask(X, Y, P)				\
  ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X),	\
					    (__v16hi)(__m256i)(Y), (int)(P),\
					    (__mmask16)(-1)))

#define _mm256_cmp_epi8_mask(X, Y, P)				\
  ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X),	\
					    (__v32qi)(__m256i)(Y), (int)(P),\
					    (__mmask32)(-1)))

#define _mm_cmp_epu16_mask(X, Y, P)				\
  ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X),	\
					    (__v8hi)(__m128i)(Y), (int)(P),\
					    (__mmask8)(-1)))

#define _mm_cmp_epu8_mask(X, Y, P)				\
  ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X),	\
					    (__v16qi)(__m128i)(Y), (int)(P),\
					    (__mmask16)(-1)))

#define _mm256_cmp_epu16_mask(X, Y, P)				\
  ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X),	\
					    (__v16hi)(__m256i)(Y), (int)(P),\
					    (__mmask16)(-1)))

#define _mm256_cmp_epu8_mask(X, Y, P)				\
  ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X),	\
					    (__v32qi)(__m256i)(Y), (int)(P),\
					    (__mmask32)-1))

#define _mm_mask_cmp_epi16_mask(M, X, Y, P)				\
  ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X),	\
					    (__v8hi)(__m128i)(Y), (int)(P),\
					    (__mmask8)(M)))

#define _mm_mask_cmp_epi8_mask(M, X, Y, P)				\
  ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X),	\
					    (__v16qi)(__m128i)(Y), (int)(P),\
					    (__mmask16)(M)))

#define _mm256_mask_cmp_epi16_mask(M, X, Y, P)				\
  ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X),	\
					    (__v16hi)(__m256i)(Y), (int)(P),\
					    (__mmask16)(M)))

#define _mm256_mask_cmp_epi8_mask(M, X, Y, P)				\
  ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X),	\
					    (__v32qi)(__m256i)(Y), (int)(P),\
					    (__mmask32)(M)))

#define _mm_mask_cmp_epu16_mask(M, X, Y, P)				\
  ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X),	\
					    (__v8hi)(__m128i)(Y), (int)(P),\
					    (__mmask8)(M)))

#define _mm_mask_cmp_epu8_mask(M, X, Y, P)				\
  ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X),	\
					    (__v16qi)(__m128i)(Y), (int)(P),\
					    (__mmask16)(M)))

#define _mm256_mask_cmp_epu16_mask(M, X, Y, P)				\
  ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X),	\
					    (__v16hi)(__m256i)(Y), (int)(P),\
					    (__mmask16)(M)))

#define _mm256_mask_cmp_epu8_mask(M, X, Y, P)				\
  ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X),	\
					    (__v32qi)(__m256i)(Y), (int)(P),\
					    (__mmask32)(M)))
#endif

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpneq_epi8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 4,
						  (__mmask32) -1);
}

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmplt_epi8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 1,
						  (__mmask32) -1);
}

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpge_epi8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 5,
						  (__mmask32) -1);
}

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmple_epi8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 2,
						  (__mmask32) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpneq_epi16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 4,
						  (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmplt_epi16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 1,
						  (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpge_epi16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 5,
						  (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmple_epi16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 2,
						  (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_epu8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 4,
						   (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epu8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 1,
						   (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_epu8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 5,
						   (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_epu8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 2,
						   (__mmask16) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_epu16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 4,
						  (__mmask8) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epu16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 1,
						  (__mmask8) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_epu16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 5,
						  (__mmask8) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_epu16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 2,
						  (__mmask8) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_epi8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 4,
						  (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 1,
						  (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_epi8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 5,
						  (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_epi8_mask (__m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 2,
						  (__mmask16) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_epi16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 4,
						 (__mmask8) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 1,
						 (__mmask8) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_epi16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 5,
						 (__mmask8) -1);
}

extern __inline __mmask8
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_epi16_mask (__m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 2,
						 (__mmask8) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
			  __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
						    (__v16hi) __Y,
						    (__v16hi) __W,
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
						    (__v16hi) __Y,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
			 __m256i __B)
{
  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v16hi) __W,
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v16hi)
						   _mm256_setzero_si256 (),
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			 __m256i __B)
{
  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		      __m128i __B)
{
  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
		      __m128i __B)
{
  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v8hi) __W,
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v8hi)
						   _mm_setzero_si128 (),
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X,
		       __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
						    (__v8hi) __Y,
						    (__v8hi) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
						    (__v8hi) __Y,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			 __m256i __B)
{
  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		      __m128i __B)
{
  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
{
  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
						    (__v16hi) __W,
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A)
{
  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
						    (__v8hi) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
{
  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
						    (__v16hi) __W,
						    (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
{
  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
						    (__v8hi) __W,
						    (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
		      __m256i __B)
{
  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__v32qi) __W,
						 (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__v32qi)
						 _mm256_setzero_si256 (),
						 (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
		   __m128i __B)
{
  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__v16qi) __W,
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__v16qi)
						 _mm_setzero_si128 (),
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
						 (__v16hi) __B,
						 (__v16hi) __W,
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
						 (__v16hi) __B,
						 (__v16hi)
						 _mm256_setzero_si256 (),
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
		      __m256i __B)
{
  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__v32qi) __W,
						 (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__v32qi)
						 _mm256_setzero_si256 (),
						 (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
						 (__v16hi) __B,
						 (__v16hi) __W,
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
						 (__v16hi) __B,
						 (__v16hi)
						 _mm256_setzero_si256 (),
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi) __W,
						  (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
						   (__v32qi) __B,
						   (__v32qi) __W,
						   (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
						   (__v32qi) __B,
						   (__v32qi)
						   _mm256_setzero_si256 (),
						   (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v16hi) __W,
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v16hi)
						   _mm256_setzero_si256 (),
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
		      __m256i __B)
{
  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__v32qi) __W,
						 (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
						 (__v32qi) __B,
						 (__v32qi)
						 _mm256_setzero_si256 (),
						 (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
						 (__v16hi) __B,
						 (__v16hi) __W,
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
						 (__v16hi) __B,
						 (__v16hi)
						 _mm256_setzero_si256 (),
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi) __W,
						  (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
		       __m256i __B)
{
  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
						   (__v32qi) __B,
						   (__v32qi) __W,
						   (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
						   (__v32qi) __B,
						   (__v32qi)
						   _mm256_setzero_si256 (),
						   (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v16hi) __W,
						   (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
						   (__v16hi) __B,
						   (__v16hi)
						   _mm256_setzero_si256 (),
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
		   __m128i __B)
{
  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__v16qi) __W,
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__v16qi)
						 _mm_setzero_si128 (),
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
			   __m256i __B)
{
  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     (__v32qi) __W,
						     (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     (__v32qi)
						     _mm256_setzero_si256 (),
						     (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
			__m128i __B)
{
  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     (__v16qi) __W,
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     (__v16qi)
						     _mm_setzero_si128 (),
						     (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			    __m256i __B)
{
  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     (__v16hi) __W,
						     (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
			 __m128i __B)
{
  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
						     (__v8hi) __B,
						     (__v8hi) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
						     (__v8hi) __B,
						     (__v8hi)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
			   __m256i __B)
{
  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     (__v32qi) __W,
						     (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     (__v32qi)
						     _mm256_setzero_si256 (),
						     (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
			__m128i __B)
{
  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     (__v16qi) __W,
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     (__v16qi)
						     _mm_setzero_si128 (),
						     (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			    __m256i __B)
{
  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     (__v16hi) __W,
						     (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     (__v16hi)
						     _mm256_setzero_si256 (),
						     (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
			 __m128i __B)
{
  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
						     (__v8hi) __B,
						     (__v8hi) __W,
						     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
						     (__v8hi) __B,
						     (__v8hi)
						     _mm_setzero_si128 (),
						     (__mmask8) __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8_mask (__m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epu8_mask (__m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
						    (__v16qi) __B, 0,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpeq_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
						    (__v16qi) __B, 0,
						    __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpeq_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epu8_mask (__m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
						    (__v32qi) __B, 0,
						    (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8_mask (__m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpeq_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
						    (__v32qi) __B, 0,
						    __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpeq_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epu16_mask (__m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
						   (__v8hi) __B, 0,
						   (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16_mask (__m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A,
						    (__v8hi) __B,
						    (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpeq_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
						   (__v8hi) __B, 0, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpeq_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A,
						    (__v8hi) __B, __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epu16_mask (__m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
						    (__v16hi) __B, 0,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16_mask (__m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpeq_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
						    (__v16hi) __B, 0,
						    __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpeq_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epu8_mask (__m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
						    (__v16qi) __B, 6,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8_mask (__m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpgt_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
						    (__v16qi) __B, 6,
						    __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpgt_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A,
						     (__v16qi) __B,
						     __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epu8_mask (__m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
						    (__v32qi) __B, 6,
						    (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8_mask (__m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpgt_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
						    (__v32qi) __B, 6,
						    __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpgt_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A,
						     (__v32qi) __B,
						     __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epu16_mask (__m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
						   (__v8hi) __B, 6,
						   (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16_mask (__m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A,
						    (__v8hi) __B,
						    (__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpgt_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
						   (__v8hi) __B, 6, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpgt_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A,
						    (__v8hi) __B, __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epu16_mask (__m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
						    (__v16hi) __B, 6,
						    (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16_mask (__m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpgt_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
						    (__v16hi) __B, 6,
						    __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpgt_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A,
						     (__v16hi) __B,
						     __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testn_epi8_mask (__m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
						 (__v16qi) __B,
						 (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
						 (__v16qi) __B, __U);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
						 (__v32qi) __B,
						 (__mmask32) -1);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
						 (__v32qi) __B, __U);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testn_epi16_mask (__m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
						(__v8hi) __B,
						(__mmask8) -1);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
						(__v8hi) __B, __U);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
						 (__v16hi) __B,
						 (__mmask16) -1);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
						 (__v16hi) __B, __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
			  __m256i __B)
{
  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi) __W,
						  (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
						  (__v32qi) __B,
						  (__v32qi)
						  _mm256_setzero_si256 (),
						  (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
		       __m128i __B)
{
  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi) __W,
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
						    (__v16hi) __B,
						    (__v32qi)
						    _mm256_setzero_si256 (),
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
			 __m256i __B)
{
  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
						    (__v16hi) __B,
						    (__v32qi) __W,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
						    (__v8hi) __B,
						    (__v16qi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
		      __m128i __B)
{
  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
						    (__v8hi) __B,
						    (__v16qi) __W,
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
						    (__v16hi) __B,
						    (__v32qi)
						    _mm256_setzero_si256 (),
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
			  __m256i __B)
{
  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
						    (__v16hi) __B,
						    (__v32qi) __W,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
						    (__v8hi) __B,
						    (__v16qi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
		       __m128i __B)
{
  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
						    (__v8hi) __B,
						    (__v16qi) __W,
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
						 (__v32qi) __W,
						 (__mmask32) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
						 (__v32qi)
						 _mm256_setzero_si256 (),
						 (__mmask32) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
						 (__v16qi) __W,
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
						 (__v16qi)
						 _mm_setzero_si128 (),
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
						 (__v16hi) __W,
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
						 (__v16hi)
						 _mm256_setzero_si256 (),
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpneq_epu8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 4,
						   (__mmask32) -1);
}

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmplt_epu8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 1,
						   (__mmask32) -1);
}

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpge_epu8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 5,
						   (__mmask32) -1);
}

extern __inline __mmask32
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmple_epu8_mask (__m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 2,
						   (__mmask32) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpneq_epu16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 4,
						   (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmplt_epu16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 1,
						   (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpge_epu16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 5,
						   (__mmask16) -1);
}

extern __inline __mmask16
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmple_epu16_mask (__m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 2,
						   (__mmask16) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
{
  __builtin_ia32_storedquhi256_mask ((short *) __P,
				     (__v16hi) __A,
				     (__mmask16) __U);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
{
  __builtin_ia32_storedquhi128_mask ((short *) __P,
				     (__v8hi) __A,
				     (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi) __W,
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi) __W,
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
						   (__v16qi) __B,
						   (__v16qi) __W,
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
						   (__v16qi) __B,
						   (__v16qi)
						   _mm_setzero_si128 (),
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v8hi) __W,
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v8hi)
						   _mm_setzero_si128 (),
						   (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
		       __m128i __B)
{
  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
						 (__v8hi) __B,
						 (__v16hi) __W,
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
{
  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
						 (__v8hi) __B,
						 (__v16hi)
						 _mm256_setzero_si256 (),
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
		       __m128i __B)
{
  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
						 (__v8hi) __B,
						 (__v16hi) __W,
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
{
  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
						 (__v8hi) __B,
						 (__v16hi)
						 _mm256_setzero_si256 (),
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
						  (__v8hi) __B,
						  (__v8hi)
						  _mm_setzero_si128 (),
						  (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
						   (__v16qi) __B,
						   (__v16qi) __W,
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
						   (__v16qi) __B,
						   (__v16qi)
						   _mm_setzero_si128 (),
						   (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v8hi) __W,
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
						   (__v8hi) __B,
						   (__v8hi)
						   _mm_setzero_si128 (),
						   (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
		   __m128i __B)
{
  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__v16qi) __W,
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
						 (__v16qi) __B,
						 (__v16qi)
						 _mm_setzero_si128 (),
						 (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi) __W,
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
						  (__v16qi) __B,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi8 (__m128i __A)
{

  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
						  (__v16qi)_mm_undefined_si128(),
						  (__mmask8) -1);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A)
{
  __builtin_ia32_pmovwb128mem_mask ((__v8qi *) __P , (__v8hi) __A, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
						  (__v16qi) __O, __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
						  (__v16qi)
						  _mm_setzero_si128 (),
						  __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) -1);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
			__m256i __B)
{
  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi) __W,
						  (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
						  (__v16hi) __B,
						  (__v16hi)
						  _mm256_setzero_si256 (),
						  (__mmask16) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) -1);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		     __m128i __B)
{
  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
		    __m128i __B)
{
  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi) __W,
						 (__mmask8) __U);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
						 (__v8hi) __B,
						 (__v8hi)
						 _mm_setzero_si128 (),
						 (__mmask8) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
		       __m128i __B)
{
  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
						 (__v8hi) __B,
						 (__v16hi) __W,
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
{
  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
						 (__v8hi) __B,
						 (__v16hi)
						 _mm256_setzero_si256 (),
						 (__mmask16) __U);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
						    (__v8si) __B,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
			  __m256i __B)
{
  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
						    (__v8si) __B,
						    (__v16hi) __W,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
						    (__v4si) __B,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_packus_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
		       __m128i __B)
{
  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
						    (__v4si) __B,
						    (__v8hi) __W, __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
						    (__v8si) __B,
						    (__v16hi)
						    _mm256_setzero_si256 (),
						    __M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
			 __m256i __B)
{
  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
						    (__v8si) __B,
						    (__v16hi) __W,
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
						    (__v4si) __B,
						    (__v8hi)
						    _mm_setzero_si128 (),
						    __M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_packs_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
		      __m128i __B)
{
  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
						    (__v4si) __B,
						    (__v8hi) __W, __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpneq_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 4,
						   (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmplt_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 1,
						   (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpge_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 5,
						   (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmple_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
						   (__v16qi) __Y, 2,
						   (__mmask16) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpneq_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 4,
						  (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmplt_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 1,
						  (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpge_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 5,
						  (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmple_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
						  (__v8hi) __Y, 2,
						  (__mmask8) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpneq_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 4,
						  (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmplt_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 1,
						  (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpge_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 5,
						  (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmple_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
						  (__v16qi) __Y, 2,
						  (__mmask16) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpneq_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 4,
						 (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmplt_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 1,
						 (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmpge_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 5,
						 (__mmask8) __M);
}

extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmple_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
						 (__v8hi) __Y, 2,
						 (__mmask8) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpneq_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 4,
						   (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmplt_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 1,
						   (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpge_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 5,
						   (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmple_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
						   (__v32qi) __Y, 2,
						   (__mmask32) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpneq_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 4,
						   (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmplt_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 1,
						   (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpge_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 5,
						   (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmple_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
						   (__v16hi) __Y, 2,
						   (__mmask16) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpneq_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 4,
						  (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmplt_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 1,
						  (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpge_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 5,
						  (__mmask32) __M);
}

extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmple_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
						  (__v32qi) __Y, 2,
						  (__mmask32) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpneq_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 4,
						  (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmplt_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 1,
						  (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmpge_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 5,
						  (__mmask16) __M);
}

extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
{
  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
						  (__v16hi) __Y, 2,
						  (__mmask16) __M);
}

#ifdef __DISABLE_AVX512VLBW__
#undef __DISABLE_AVX512VLBW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VLBW__ */

#endif /* _AVX512VLBWINTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vnnivlintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VNNIVLINTRIN_H_INCLUDED
#define _AVX512VNNIVLINTRIN_H_INCLUDED

#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__)
#pragma GCC push_options
#pragma GCC target("avx512vnni,avx512vl")
#define __DISABLE_AVX512VNNIVL__
#endif /* __AVX512VNNIVL__ */

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpbusd_epi32 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si)__A, (__v8si) __B,
								(__v8si) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_dpbusd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask ((__v8si)__A, (__v8si) __C,
						(__v8si) __D, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpbusd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz ((__v8si)__B,
				(__v8si) __C, (__v8si) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpbusd_epi32 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si)__A, (__v4si) __B,
								(__v4si) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpbusd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask ((__v4si)__A, (__v4si) __C,
						(__v4si) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_dpbusd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz ((__v4si)__B,
				(__v4si) __C, (__v4si) __D, (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpbusds_epi32 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si)__A, (__v8si) __B,
								(__v8si) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_dpbusds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask ((__v8si)__A,
				(__v8si) __C, (__v8si) __D, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpbusds_epi32 (__mmask8 __A, __m256i __B, __m256i __C,
								__m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz ((__v8si)__B,
				(__v8si) __C, (__v8si) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpbusds_epi32 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si)__A, (__v4si) __B,
								(__v4si) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpbusds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask ((__v4si)__A,
				(__v4si) __C, (__v4si) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_dpbusds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz ((__v4si)__B,
				(__v4si) __C, (__v4si) __D, (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpwssd_epi32 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si)__A, (__v8si) __B,
								(__v8si) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_dpwssd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask ((__v8si)__A, (__v8si) __C,
						(__v8si) __D, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpwssd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz ((__v8si)__B,
				(__v8si) __C, (__v8si) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpwssd_epi32 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si)__A, (__v4si) __B,
								(__v4si) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpwssd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask ((__v4si)__A, (__v4si) __C,
						(__v4si) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_dpwssd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz ((__v4si)__B,
				(__v4si) __C, (__v4si) __D, (__mmask8)__A);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpwssds_epi32 (__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si)__A, (__v8si) __B,
								(__v8si) __C);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_dpwssds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask ((__v8si)__A,
				(__v8si) __C, (__v8si) __D, (__mmask8)__B);
}

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpwssds_epi32 (__mmask8 __A, __m256i __B, __m256i __C,
							__m256i __D)
{
  return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz ((__v8si)__B,
				(__v8si) __C, (__v8si) __D, (__mmask8)__A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpwssds_epi32 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si)__A, (__v4si) __B,
								(__v4si) __C);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpwssds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask ((__v4si)__A,
				(__v4si) __C, (__v4si) __D, (__mmask8)__B);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_dpwssds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
{
  return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz ((__v4si)__B,
				(__v4si) __C, (__v4si) __D, (__mmask8)__A);
}
#ifdef __DISABLE_AVX512VNNIVL__
#undef __DISABLE_AVX512VNNIVL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VNNIVL__ */
#endif /* __DISABLE_AVX512VNNIVL__ */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _SHAINTRIN_H_INCLUDED
#define _SHAINTRIN_H_INCLUDED

#ifndef __SHA__
#pragma GCC push_options
#pragma GCC target("sha")
#define __DISABLE_SHA__
#endif /* __SHA__ */

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1msg1_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_sha1msg1 ((__v4si) __A, (__v4si) __B);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1msg2_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_sha1msg2 ((__v4si) __A, (__v4si) __B);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1nexte_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B);
}

#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I)
{
  return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I);
}
#else
#define _mm_sha1rnds4_epu32(A, B, I)				    \
  ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)(A),	    \
				       (__v4si)(__m128i)(B), (int)(I)))
#endif

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha256msg1_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_sha256msg1 ((__v4si) __A, (__v4si) __B);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha256msg2_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_sha256msg2 ((__v4si) __A, (__v4si) __B);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_sha256rnds2 ((__v4si) __A, (__v4si) __B,
					       (__v4si) __C);
}

#ifdef __DISABLE_SHA__
#undef __DISABLE_SHA__
#pragma GCC pop_options
#endif /* __DISABLE_SHA__ */

#endif /* _SHAINTRIN_H_INCLUDED */
/* Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _X86INTRIN_H_INCLUDED
# error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _XOPMMINTRIN_H_INCLUDED
#define _XOPMMINTRIN_H_INCLUDED

#include <fma4intrin.h>

#ifndef __XOP__
#pragma GCC push_options
#pragma GCC target("xop")
#define __DISABLE_XOP__
#endif /* __XOP__ */

/* Integer multiply/add instructions. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpmadcswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
}

/* Packed Integer Horizontal Add and Subtract */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddw_epi8(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epi8(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epi8(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epi16(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epi16(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epi32(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphadddq ((__v4si)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddw_epu8(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epu8(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epu8(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epu16(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epu16(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epu32(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubw_epi8(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubd_epi16(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubq_epi32(__m128i __A)
{
  return  (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A);
}

/* Vector conditional move and permute */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpcmov (__A, __B, __C);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
{
  return  (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
}

/* Packed Integer Rotates and Shifts
   Rotates - Non-Immediate form */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi8(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi16(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi32(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi64(__m128i __A,  __m128i __B)
{
  return (__m128i)  __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B);
}

/* Rotates - Immediate form */

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi8(__m128i __A, const int __B)
{
  return  (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi16(__m128i __A, const int __B)
{
  return  (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi32(__m128i __A, const int __B)
{
  return  (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi64(__m128i __A, const int __B)
{
  return  (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B);
}
#else
#define _mm_roti_epi8(A, N) \
  ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N)))
#define _mm_roti_epi16(A, N) \
  ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_roti_epi32(A, N) \
  ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N)))
#define _mm_roti_epi64(A, N) \
  ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N)))
#endif

/* Shifts */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi8(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi16(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi32(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi64(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B);
}


extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi8(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi16(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi32(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshad ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi64(__m128i __A,  __m128i __B)
{
  return  (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B);
}

/* Compare and Predicate Generation
   pcom (integer, unsigned bytes) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B);
}

/*pcom (integer, unsigned words) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B);
}

/*pcom (integer, unsigned double words) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B);
}

/*pcom (integer, unsigned quad words) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B);
}

/*pcom (integer, signed bytes) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B);
}

/*pcom (integer, signed words) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi16(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B);
}

/*pcom (integer, signed double words) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi32(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B);
}

/*pcom (integer, signed quad words) */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi64(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B);
}

/* FRCZ */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_pd (__m128d __A)
{
  return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A,
					(__v4sf)
					__builtin_ia32_vfrczss ((__v4sf)__B));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df)__A,
					 (__v2df)
					 __builtin_ia32_vfrczsd ((__v2df)__B));
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_frcz_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_frcz_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A);
}

/* PERMIL2 */

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I)
{
  return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
					      (__v2df)__Y,
					      (__v2di)__C,
					      __I);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I)
{
  return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X,
						 (__v4df)__Y,
						 (__v4di)__C,
						 __I);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I)
{
  return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
					     (__v4sf)__Y,
					     (__v4si)__C,
					     __I);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I)
{
  return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X,
						(__v8sf)__Y,
						(__v8si)__C,
						__I);
}
#else
#define _mm_permute2_pd(X, Y, C, I)					\
  ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X),		\
					(__v2df)(__m128d)(Y),		\
					(__v2di)(__m128i)(C),		\
					(int)(I)))

#define _mm256_permute2_pd(X, Y, C, I)					\
  ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X),	\
					   (__v4df)(__m256d)(Y),	\
					   (__v4di)(__m256i)(C),	\
					   (int)(I)))

#define _mm_permute2_ps(X, Y, C, I)					\
  ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X),		\
				       (__v4sf)(__m128)(Y),		\
				       (__v4si)(__m128i)(C),		\
				       (int)(I)))

#define _mm256_permute2_ps(X, Y, C, I)					\
  ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),		\
					  (__v8sf)(__m256)(Y),  	\
					  (__v8si)(__m256i)(C),		\
 					  (int)(I)))
#endif /* __OPTIMIZE__ */

#ifdef __DISABLE_XOP__
#undef __DISABLE_XOP__
#pragma GCC pop_options
#endif /* __DISABLE_XOP__ */

#endif /* _XOPMMINTRIN_H_INCLUDED */
/* Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _X86INTRIN_H_INCLUDED
# error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _LWPINTRIN_H_INCLUDED
#define _LWPINTRIN_H_INCLUDED

#ifndef __LWP__
#pragma GCC push_options
#pragma GCC target("lwp")
#define __DISABLE_LWP__
#endif /* __LWP__ */

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__llwpcb (void *__pcbAddress)
{
  __builtin_ia32_llwpcb (__pcbAddress);
}

extern __inline void * __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__slwpcb (void)
{
  return __builtin_ia32_slwpcb ();
}

#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lwpval32 (unsigned int __data2, unsigned int __data1, unsigned int __flags)
{
  __builtin_ia32_lwpval32 (__data2, __data1, __flags);
}

#ifdef __x86_64__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lwpval64 (unsigned long long __data2, unsigned int __data1,
	    unsigned int __flags)
{
  __builtin_ia32_lwpval64 (__data2, __data1, __flags);
}
#endif
#else
#define __lwpval32(D2, D1, F) \
  (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), \
			    (unsigned int) (F)))
#ifdef __x86_64__
#define __lwpval64(D2, D1, F) \
  (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), \
			    (unsigned int) (F)))
#endif
#endif


#ifdef __OPTIMIZE__
extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lwpins32 (unsigned int __data2, unsigned int __data1, unsigned int __flags)
{
  return __builtin_ia32_lwpins32 (__data2, __data1, __flags);
}

#ifdef __x86_64__
extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lwpins64 (unsigned long long __data2, unsigned int __data1,
	    unsigned int __flags)
{
  return __builtin_ia32_lwpins64 (__data2, __data1, __flags);
}
#endif
#else
#define __lwpins32(D2, D1, F) \
  (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), \
			    (unsigned int) (F)))
#ifdef __x86_64__
#define __lwpins64(D2, D1, F) \
  (__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), \
			    (unsigned int) (F)))
#endif
#endif

#ifdef __DISABLE_LWP__
#undef __DISABLE_LWP__
#pragma GCC pop_options
#endif /* __DISABLE_LWP__ */

#endif /* _LWPINTRIN_H_INCLUDED */
/* Copyright (C) 2013-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512PFINTRIN_H_INCLUDED
#define _AVX512PFINTRIN_H_INCLUDED

#ifndef __AVX512PF__
#pragma GCC push_options
#pragma GCC target("avx512pf")
#define __DISABLE_AVX512PF__
#endif /* __AVX512PF__ */

/* Internal data types for implementing the intrinsics.  */
typedef long long __v8di __attribute__ ((__vector_size__ (64)));
typedef int __v16si __attribute__ ((__vector_size__ (64)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));

typedef unsigned char  __mmask8;
typedef unsigned short __mmask16;

#ifdef __OPTIMIZE__
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr,
			      int __scale, int __hint)
{
  __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr,
			      int __scale, int __hint)
{
  __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask,
				   void const *__addr, int __scale, int __hint)
{
  __builtin_ia32_gatherpfdpd (__mask, (__v8si) __index, __addr, __scale,
			      __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32gather_ps (__m512i __index, __mmask16 __mask,
				   void const *__addr, int __scale, int __hint)
{
  __builtin_ia32_gatherpfdps (__mask, (__v16si) __index, __addr, __scale,
			      __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr,
			      int __scale, int __hint)
{
  __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr,
			      int __scale, int __hint)
{
  __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask,
				   void const *__addr, int __scale, int __hint)
{
  __builtin_ia32_gatherpfqpd (__mask, (__v8di) __index, __addr, __scale,
			      __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64gather_ps (__m512i __index, __mmask8 __mask,
				   void const *__addr, int __scale, int __hint)
{
  __builtin_ia32_gatherpfqps (__mask, (__v8di) __index, __addr, __scale,
			      __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32scatter_pd (void *__addr, __m256i __index, int __scale,
			       int __hint)
{
  __builtin_ia32_scatterpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32scatter_ps (void *__addr, __m512i __index, int __scale,
			       int __hint)
{
  __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32scatter_pd (void *__addr, __mmask8 __mask,
				    __m256i __index, int __scale, int __hint)
{
  __builtin_ia32_scatterpfdpd (__mask, (__v8si) __index, __addr, __scale,
			       __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32scatter_ps (void *__addr, __mmask16 __mask,
				    __m512i __index, int __scale, int __hint)
{
  __builtin_ia32_scatterpfdps (__mask, (__v16si) __index, __addr, __scale,
			       __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64scatter_pd (void *__addr, __m512i __index, int __scale,
			       int __hint)
{
  __builtin_ia32_scatterpfqpd ((__mmask8) 0xFF, (__v8di) __index,__addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64scatter_ps (void *__addr, __m512i __index, int __scale,
			       int __hint)
{
  __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
			      __scale, __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64scatter_pd (void *__addr, __mmask16 __mask,
				    __m512i __index, int __scale, int __hint)
{
  __builtin_ia32_scatterpfqpd (__mask, (__v8di) __index, __addr, __scale,
			      __hint);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64scatter_ps (void *__addr, __mmask16 __mask,
				    __m512i __index, int __scale, int __hint)
{
  __builtin_ia32_scatterpfqps (__mask, (__v8di) __index, __addr, __scale,
			       __hint);
}

#else
#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT)		     \
  __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX),     \
			      (void const *) (ADDR), (int) (SCALE),	     \
			      (int) (HINT))

#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT)		     \
  __builtin_ia32_gatherpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), \
			      (void const *) (ADDR), (int) (SCALE),	     \
			      (int) (HINT))

#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT)    \
  __builtin_ia32_gatherpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX),  \
			      (void const *) (ADDR), (int) (SCALE),	     \
			      (int) (HINT))

#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT)    \
  __builtin_ia32_gatherpfdps ((__mmask16) (MASK), (__v16si)(__m512i) (INDEX),\
			      (void const *) (ADDR), (int) (SCALE),	     \
			      (int) (HINT))

#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT)		     \
  __builtin_ia32_gatherpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),     \
			      (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT)		     \
  __builtin_ia32_gatherpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),     \
			      (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT)    \
  __builtin_ia32_gatherpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX),  \
			      (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT)    \
  __builtin_ia32_gatherpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX),  \
			      (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT)              \
  __builtin_ia32_scatterpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX),    \
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT)              \
  __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX),\
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT)   \
  __builtin_ia32_scatterpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), \
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT)   \
  __builtin_ia32_scatterpfdps ((__mmask16) (MASK),			     \
			       (__v16si)(__m512i) (INDEX),		     \
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT)              \
  __builtin_ia32_scatterpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),    \
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT)              \
  __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),    \
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT)   \
  __builtin_ia32_scatterpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))

#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT)   \
  __builtin_ia32_scatterpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
			       (void *) (ADDR), (int) (SCALE), (int) (HINT))
#endif

#ifdef __DISABLE_AVX512PF__
#undef __DISABLE_AVX512PF__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512PF__ */

#endif /* _AVX512PFINTRIN_H_INCLUDED */
/* Copyright (C) 1992-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* This administrivia gets added to the beginning of limits.h
   if the system has its own version of limits.h.  */

/* We use _GCC_LIMITS_H_ because we want this not to match
   any macros that the system's limits.h uses for its own purposes.  */
#ifndef _GCC_LIMITS_H_  /* Terminated in limity.h.  */
#define _GCC_LIMITS_H_

#ifndef _LIBC_LIMITS_H_
/* Use "..." so that we find syslimits.h only in this same directory.  */
#include "syslimits.h"
#endif
/* Copyright (C) 1991-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifndef _LIMITS_H___
#define _LIMITS_H___

/* Number of bits in a `char'.  */
#undef CHAR_BIT
#define CHAR_BIT __CHAR_BIT__

/* Maximum length of a multibyte character.  */
#ifndef MB_LEN_MAX
#define MB_LEN_MAX 1
#endif

/* Minimum and maximum values a `signed char' can hold.  */
#undef SCHAR_MIN
#define SCHAR_MIN (-SCHAR_MAX - 1)
#undef SCHAR_MAX
#define SCHAR_MAX __SCHAR_MAX__

/* Maximum value an `unsigned char' can hold.  (Minimum is 0).  */
#undef UCHAR_MAX
#if __SCHAR_MAX__ == __INT_MAX__
# define UCHAR_MAX (SCHAR_MAX * 2U + 1U)
#else
# define UCHAR_MAX (SCHAR_MAX * 2 + 1)
#endif

/* Minimum and maximum values a `char' can hold.  */
#ifdef __CHAR_UNSIGNED__
# undef CHAR_MIN
# if __SCHAR_MAX__ == __INT_MAX__
#  define CHAR_MIN 0U
# else
#  define CHAR_MIN 0
# endif
# undef CHAR_MAX
# define CHAR_MAX UCHAR_MAX
#else
# undef CHAR_MIN
# define CHAR_MIN SCHAR_MIN
# undef CHAR_MAX
# define CHAR_MAX SCHAR_MAX
#endif

/* Minimum and maximum values a `signed short int' can hold.  */
#undef SHRT_MIN
#define SHRT_MIN (-SHRT_MAX - 1)
#undef SHRT_MAX
#define SHRT_MAX __SHRT_MAX__

/* Maximum value an `unsigned short int' can hold.  (Minimum is 0).  */
#undef USHRT_MAX
#if __SHRT_MAX__ == __INT_MAX__
# define USHRT_MAX (SHRT_MAX * 2U + 1U)
#else
# define USHRT_MAX (SHRT_MAX * 2 + 1)
#endif

/* Minimum and maximum values a `signed int' can hold.  */
#undef INT_MIN
#define INT_MIN (-INT_MAX - 1)
#undef INT_MAX
#define INT_MAX __INT_MAX__

/* Maximum value an `unsigned int' can hold.  (Minimum is 0).  */
#undef UINT_MAX
#define UINT_MAX (INT_MAX * 2U + 1U)

/* Minimum and maximum values a `signed long int' can hold.
   (Same as `int').  */
#undef LONG_MIN
#define LONG_MIN (-LONG_MAX - 1L)
#undef LONG_MAX
#define LONG_MAX __LONG_MAX__

/* Maximum value an `unsigned long int' can hold.  (Minimum is 0).  */
#undef ULONG_MAX
#define ULONG_MAX (LONG_MAX * 2UL + 1UL)

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* Minimum and maximum values a `signed long long int' can hold.  */
# undef LLONG_MIN
# define LLONG_MIN (-LLONG_MAX - 1LL)
# undef LLONG_MAX
# define LLONG_MAX __LONG_LONG_MAX__

/* Maximum value an `unsigned long long int' can hold.  (Minimum is 0).  */
# undef ULLONG_MAX
# define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL)
#endif

#if defined (__GNU_LIBRARY__) ? defined (__USE_GNU) : !defined (__STRICT_ANSI__)
/* Minimum and maximum values a `signed long long int' can hold.  */
# undef LONG_LONG_MIN
# define LONG_LONG_MIN (-LONG_LONG_MAX - 1LL)
# undef LONG_LONG_MAX
# define LONG_LONG_MAX __LONG_LONG_MAX__

/* Maximum value an `unsigned long long int' can hold.  (Minimum is 0).  */
# undef ULONG_LONG_MAX
# define ULONG_LONG_MAX (LONG_LONG_MAX * 2ULL + 1ULL)
#endif

#ifdef __STDC_WANT_IEC_60559_BFP_EXT__
/* TS 18661-1 widths of integer types.  */
# undef CHAR_WIDTH
# define CHAR_WIDTH __SCHAR_WIDTH__
# undef SCHAR_WIDTH
# define SCHAR_WIDTH __SCHAR_WIDTH__
# undef UCHAR_WIDTH
# define UCHAR_WIDTH __SCHAR_WIDTH__
# undef SHRT_WIDTH
# define SHRT_WIDTH __SHRT_WIDTH__
# undef USHRT_WIDTH
# define USHRT_WIDTH __SHRT_WIDTH__
# undef INT_WIDTH
# define INT_WIDTH __INT_WIDTH__
# undef UINT_WIDTH
# define UINT_WIDTH __INT_WIDTH__
# undef LONG_WIDTH
# define LONG_WIDTH __LONG_WIDTH__
# undef ULONG_WIDTH
# define ULONG_WIDTH __LONG_WIDTH__
# undef LLONG_WIDTH
# define LLONG_WIDTH __LONG_LONG_WIDTH__
# undef ULLONG_WIDTH
# define ULLONG_WIDTH __LONG_LONG_WIDTH__
#endif

#endif /* _LIMITS_H___ */
/* This administrivia gets added to the end of limits.h
   if the system has its own version of limits.h.  */

#else /* not _GCC_LIMITS_H_ */

#ifdef _GCC_NEXT_LIMITS_H
#include_next <limits.h>		/* recurse down to the real one */
#endif

#endif /* not _GCC_LIMITS_H_ */
/* Copyright (C) 2009-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
# error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
#endif


#ifndef _LZCNTINTRIN_H_INCLUDED
#define _LZCNTINTRIN_H_INCLUDED

#ifndef __LZCNT__
#pragma GCC push_options
#pragma GCC target("lzcnt")
#define __DISABLE_LZCNT__
#endif /* __LZCNT__ */

extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lzcnt16 (unsigned short __X)
{
  return __builtin_ia32_lzcnt_u16 (__X);
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lzcnt32 (unsigned int __X)
{
  return __builtin_ia32_lzcnt_u32 (__X);
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_lzcnt_u32 (unsigned int __X)
{
  return __builtin_ia32_lzcnt_u32 (__X);
}

#ifdef __x86_64__
extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lzcnt64 (unsigned long long __X)
{
  return __builtin_ia32_lzcnt_u64 (__X);
}

extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_lzcnt_u64 (unsigned long long __X)
{
  return __builtin_ia32_lzcnt_u64 (__X);
}
#endif

#ifdef __DISABLE_LZCNT__
#undef __DISABLE_LZCNT__
#pragma GCC pop_options
#endif /* __DISABLE_LZCNT__ */

#endif /* _LZCNTINTRIN_H_INCLUDED */