Documentation for vector extensions

From-SVN: r45880
2001-09-29 16:33:20 +00:00 · 2001-09-29 16:33:20 +00:00 · 1255c85c04
commit 1255c85c04
parent 86be733d75
4 changed files with 466 additions and 9 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,11 @@
+2001-09-29  Bernd Schmidt  <bernds@redhat.com>
+
+	* config/i386/i386.c (init_mmx_sse_builtins): Fix type of storelps and
+	storehps builtins.
+	* doc/extend.texi (Vector Extensions): New node.
+	* doc/invoke.texi (Machine Dependent Options): Add documentation for
+        i386 -mmmx, -msse, -m3dnow.
+
 Sat Sep 29 15:08:16 CEST 2001  Jan Hubicka  <jh@suse.cz>

 	* doc/invoke.texi (Optimize Options): Revert an accidental checkin.
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@ -10989,10 +10989,10 @@ ix86_init_mmx_sse_builtins ()
 			   tree_cons (NULL_TREE, V4SF_type_node,
 				      tree_cons (NULL_TREE, pv2si_type_node,
 						 endlink)));
-  tree v4sf_ftype_pv2si_v4sf
-    = build_function_type (V4SF_type_node,
-			   tree_cons (NULL_TREE, V4SF_type_node,
-				      tree_cons (NULL_TREE, pv2si_type_node,
+  tree void_ftype_pv2si_v4sf
+    = build_function_type (void_type_node,
+			   tree_cons (NULL_TREE, pv2si_type_node,
+				      tree_cons (NULL_TREE, V4SF_type_node,
 						 endlink)));
  tree void_ftype_pfloat_v4sf
    = build_function_type (void_type_node,
@ -11151,9 +11151,9 @@ ix86_init_mmx_sse_builtins ()
  for (i = 0, d = bdesc_comi; i < sizeof (bdesc_comi) / sizeof *d; i++, d++)
    def_builtin (d->mask, d->name, int_ftype_v4sf_v4sf, d->code);

-  def_builtin (MASK_SSE, "__builtin_ia32_packsswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKSSWB);
-  def_builtin (MASK_SSE, "__builtin_ia32_packssdw", v4hi_ftype_v2si_v2si, IX86_BUILTIN_PACKSSDW);
-  def_builtin (MASK_SSE, "__builtin_ia32_packuswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKUSWB);
+  def_builtin (MASK_MMX, "__builtin_ia32_packsswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKSSWB);
+  def_builtin (MASK_MMX, "__builtin_ia32_packssdw", v4hi_ftype_v2si_v2si, IX86_BUILTIN_PACKSSDW);
+  def_builtin (MASK_MMX, "__builtin_ia32_packuswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKUSWB);

  def_builtin (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS);
  def_builtin (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI);
@ -11176,8 +11176,8 @@ ix86_init_mmx_sse_builtins ()

  def_builtin (MASK_SSE, "__builtin_ia32_loadhps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADHPS);
  def_builtin (MASK_SSE, "__builtin_ia32_loadlps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADLPS);
-  def_builtin (MASK_SSE, "__builtin_ia32_storehps", v4sf_ftype_pv2si_v4sf, IX86_BUILTIN_STOREHPS);
-  def_builtin (MASK_SSE, "__builtin_ia32_storelps", v4sf_ftype_pv2si_v4sf, IX86_BUILTIN_STORELPS);
+  def_builtin (MASK_SSE, "__builtin_ia32_storehps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STOREHPS);
+  def_builtin (MASK_SSE, "__builtin_ia32_storelps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STORELPS);

  def_builtin (MASK_SSE, "__builtin_ia32_movmskps", int_ftype_v4sf, IX86_BUILTIN_MOVMSKPS);
  def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_pmovmskb", int_ftype_v8qi, IX86_BUILTIN_PMOVMSKB);
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@ -430,6 +430,7 @@ extensions, accepted by GCC in C89 mode and in C++.
 * Function Names::	Printable strings which are the name of the current
 			 function.
 * Return Address::      Getting the return or frame address of a function.
+* Vector Extensions::   Using vector instructions through built-in functions.
 * Other Builtins::      Other built-in functions.
 * Pragmas::             Pragmas accepted by GCC.
@end menu
@ -483,6 +484,7 @@ extensions, accepted by GCC in C89 mode and in C++.
 * Function Names::	Printable strings which are the name of the current
 			 function.
 * Return Address::      Getting the return or frame address of a function.
+* Vector Extensions::   Using vector instructions through built-in functions.
 * Other Builtins::      Other built-in functions.
 * Pragmas::             Pragmas accepted by GCC.
@end menu
@ -4147,6 +4149,75 @@ This function should only be used with a non-zero argument for debugging
 purposes.
@end deftypefn

+@node Vector Extensions
+@section Using vector instructions through built-in functions
+
+On some targets, the instruction set contains SIMD vector instructions that
+operate on multiple values contained in one large register at the same time.
+For example, on the i386 the MMX, 3Dnow! and SSE extensions can be used
+this way.
+
+The first step in using these extensions is to provide the necessary data
+types.  This should be done using an appropriate @code{typedef}:
+
+@example
+typedef int v4si __attribute__ ((mode(V4SI)));
+@end example
+
+The base type @code{int} is effectively ignored by the compiler, the
+actual properties of the new type @code{v4si} are defined by the
+@code{__attribute__}.  It defines the machine mode to be used; for vector
+types these have the form @code{VnB}; @code{n} should be the number of
+elements in the vector, and @code{B} should be the base mode of the
+individual elements.  The following can be used as base modes:
+
+@table @code
+@item QI
+An integer that is as wide as the smallest addressable unit, usually 8 bits.
+@item HI
+An integer, twice as wide as a QI mode integer, usually 16 bits.
+@item SI
+An integer, four times as wide as a QI mode integer, usually 32 bits.
+@item DI
+An integer, eight times as wide as a QI mode integer, usually 64 bits.
+@item SF
+A floating point value, as wide as a SI mode integer, usually 32 bits.
+@item DF
+A floating point value, as wide as a DI mode integer, usually 64 bits.
+@end table
+
+Not all base types or combinations are always valid; which modes can be used
+is determined by the target machine.  For example, if targeting the i386 MMX
+extensions, only @code{V8QI}, @code{V4HI} and @code{V2SI} are allowed modes.
+
+There are no @code{V1xx} vector modes - they would be identical to the
+corresponding base mode.
+
+There is no distinction between signed and unsigned vector modes.  This
+distinction is made by the operations performed on the vectors, not
+by the data type.
+
+The types defined in this manner are somewhat special, they cannot be
+used with most normal C operations (i.e., a vector addition can @emph{not}
+be represented by a normal addition of two vector type variables).  You
+can declare only variables and use them in function calls and returns, as
+well as in assignments and some casts.  It is possible to cast from one
+vector type to another, provided they are of the same size (in fact, you
+can also cast vectors to and from other datatypes of the same size).
+
+A port that supports vector operations provides a set of built-in functions
+that can be used to operate on vectors.  For example, a function to add two
+vectors and multiply the result by a third could look like this:
+
+@example
+v4si f (v4si a, v4si b, v4si c)
+@{
+  v4si tmp = __builtin_addv4si (a, b);
+  return __builtin_mulv4si (tmp, c);
+@}
+
+@end example
+
@node Other Builtins
@section Other built-in functions provided by GCC
@cindex built-in functions
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@ -471,6 +471,7 @@ in the following sections.
 -mno-fp-ret-in-387  -msoft-float  -msvr3-shlib @gol
 -mno-wide-multiply  -mrtd  -malign-double @gol
 -mpreferred-stack-boundary=@var{num} @gol
+-mmmx  -msse  -m3dnow @gol
 -mthreads  -mno-align-stringops  -minline-all-stringops @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double  -mregparm=@var{num}  -momit-leaf-frame-pointer}
@ -7600,6 +7601,383 @@ to stack space usage, such as embedded systems and operating system kernels,
 may want to reduce the preferred alignment to
@option{-mpreferred-stack-boundary=2}.

+@item -mmmx
+@itemx -mno-mmx
+@item -msse
+@itemx -mno-sse
+@item -m3dnow
+@itemx -mno-3dnow
+@opindex mmmx
+@opindex mno-mmx
+@opindex msse
+@opindex mno-sse
+@opindex m3dnow
+@opindex mno-3dnow
+These switches enable or disable the use of built-in functions that allow
+direct access to the MMX, SSE and 3Dnow extensions of the instruction set.
+
+The following machine modes are available for use with MMX builtins
+(@pxref{Vector Extensions}): @code{V2SI} for a vector of two 32 bit integers,
+@code{V4HI} for a vector of four 16 bit integers, and @code{V8QI} for a
+vector of eight 8 bit integers.  Some of the builtins operate on MMX
+registers as a whole 64 bit entity, these use @code{DI} as their mode.
+
+If 3Dnow extensions are enabled, @code{V2SF} is used as a mode for a vector
+of two 32 bit floating point values.
+
+If SSE extensions are enabled, @code{V4SF} is used for a vector of four 32 bit
+floating point values.  Some instructions use a vector of four 32 bit
+integers, these use @code{V4SI}.  Finally, some instructions operate on an
+entire vector register, interpreting it as a 128 bit integer, these use mode
+@code{TI}.
+
+The following builtins are made available by @option{-mmmx}:
+@table @code
+@item v8qi __builtin_ia32_paddb (v8qi, v8qi)
+Generates the @code{paddb} machine instruction.
+@item v4hi __builtin_ia32_paddw (v4hi, v4hi)
+Generates the @code{paddw} machine instruction.
+@item v2si __builtin_ia32_paddd (v2si, v2si)
+Generates the @code{paddd} machine instruction.
+@item v8qi __builtin_ia32_psubb (v8qi, v8qi)
+Generates the @code{psubb} machine instruction.
+@item v4hi __builtin_ia32_psubw (v4hi, v4hi)
+Generates the @code{psubw} machine instruction.
+@item v2si __builtin_ia32_psubd (v2si, v2si)
+Generates the @code{psubd} machine instruction.
+
+@item v8qi __builtin_ia32_paddsb (v8qi, v8qi)
+Generates the @code{paddsb} machine instruction.
+@item v4hi __builtin_ia32_paddsw (v4hi, v4hi)
+Generates the @code{paddsw} machine instruction.
+@item v8qi __builtin_ia32_psubsb (v8qi, v8qi)
+Generates the @code{psubsb} machine instruction.
+@item v4hi __builtin_ia32_psubsw (v4hi, v4hi)
+Generates the @code{psubsw} machine instruction.
+
+@item v8qi __builtin_ia32_paddusb (v8qi, v8qi)
+Generates the @code{paddusb} machine instruction.
+@item v4hi __builtin_ia32_paddusw (v4hi, v4hi)
+Generates the @code{paddusw} machine instruction.
+@item v8qi __builtin_ia32_psubusb (v8qi, v8qi)
+Generates the @code{psubusb} machine instruction.
+@item v4hi __builtin_ia32_psubusw (v4hi, v4hi)
+Generates the @code{psubusw} machine instruction.
+
+@item v4hi __builtin_ia32_pmullw (v4hi, v4hi)
+Generates the @code{pmullw} machine instruction.
+@item v4hi __builtin_ia32_pmulhw (v4hi, v4hi)
+Generates the @code{pmulhw} machine instruction.
+
+@item di __builtin_ia32_pand (di, di)
+Generates the @code{pand} machine instruction.
+@item di __builtin_ia32_pandn (di,di)
+Generates the @code{pandn} machine instruction.
+@item di __builtin_ia32_por (di, di)
+Generates the @code{por} machine instruction.
+@item di __builtin_ia32_pxor (di, di)
+Generates the @code{pxor} machine instruction.
+
+@item v8qi __builtin_ia32_pcmpeqb (v8qi, v8qi)
+Generates the @code{pcmpeqb} machine instruction.
+@item v4hi __builtin_ia32_pcmpeqw (v4hi, v4hi)
+Generates the @code{pcmpeqw} machine instruction.
+@item v2si __builtin_ia32_pcmpeqd (v2si, v2si)
+Generates the @code{pcmpeqd} machine instruction.
+@item v8qi __builtin_ia32_pcmpgtb (v8qi, v8qi)
+Generates the @code{pcmpgtb} machine instruction.
+@item v4hi __builtin_ia32_pcmpgtw (v4hi, v4hi)
+Generates the @code{pcmpgtw} machine instruction.
+@item v2si __builtin_ia32_pcmpgtd (v2si, v2si)
+Generates the @code{pcmpgtd} machine instruction.
+
+@item v8qi __builtin_ia32_punpckhbw (v8qi, v8qi)
+Generates the @code{punpckhbw} machine instruction.
+@item v4hi __builtin_ia32_punpckhwd (v4hi, v4hi)
+Generates the @code{punpckhwd} machine instruction.
+@item v2si __builtin_ia32_punpckhdq (v2si, v2si)
+Generates the @code{punpckhdq} machine instruction.
+@item v8qi __builtin_ia32_punpcklbw (v8qi, v8qi)
+Generates the @code{punpcklbw} machine instruction.
+@item v4hi __builtin_ia32_punpcklwd (v4hi, v4hi)
+Generates the @code{punpcklwd} machine instruction.
+@item v2si __builtin_ia32_punpckldq (v2si, v2si)
+Generates the @code{punpckldq} machine instruction.
+
+@item v8qi __builtin_ia32_packsswb (v4hi, v4hi)
+Generates the @code{packsswb} machine instruction.
+@item v4hi __builtin_ia32_packssdw (v2si, v2si)
+Generates the @code{packssdw} machine instruction.
+@item v8qi __builtin_ia32_packuswb (v4hi, v4hi)
+Generates the @code{packuswb} machine instruction.
+
+@end table
+
+The following builtins are made available either with @option{-msse}, or
+with a combination of @option{-m3dnow} and @option{-march=athlon}.
+@table @code
+
+@item v4hi __builtin_ia32_pmulhuw (v4hi, v4hi)
+Generates the @code{pmulhuw} machine instruction.
+
+@item v8qi __builtin_ia32_pavgb (v8qi, v8qi)
+Generates the @code{pavgb} machine instruction.
+@item v4hi __builtin_ia32_pavgw (v4hi, v4hi)
+Generates the @code{pavgw} machine instruction.
+@item v4hi __builtin_ia32_psadbw (v8qi, v8qi)
+Generates the @code{psadbw} machine instruction.
+
+@item v8qi __builtin_ia32_pmaxub (v8qi, v8qi)
+Generates the @code{pmaxub} machine instruction.
+@item v4hi __builtin_ia32_pmaxsw (v4hi, v4hi)
+Generates the @code{pmaxsw} machine instruction.
+@item v8qi __builtin_ia32_pminub (v8qi, v8qi)
+Generates the @code{pminub} machine instruction.
+@item v4hi __builtin_ia32_pminsw (v4hi, v4hi)
+Generates the @code{pminsw} machine instruction.
+
+@item int __builtin_ia32_pextrw (v4hi, int)
+Generates the @code{pextrw} machine instruction.
+@item v4hi __builtin_ia32_pinsrw (v4hi, int, int)
+Generates the @code{pinsrw} machine instruction.
+
+@item int __builtin_ia32_pmovmskb (v8qi)
+Generates the @code{pmovmskb} machine instruction.
+@item void __builtin_ia32_maskmovq (v8qi, v8qi, char *)
+Generates the @code{maskmovq} machine instruction.
+@item void __builtin_ia32_movntq (di *, di)
+Generates the @code{movntq} machine instruction.
+@item void __builtin_ia32_sfence (void)
+Generates the @code{sfence} machine instruction.
+@item void __builtin_ia32_prefetch (char *, int selector)
+Generates a prefetch machine instruction, depending on the value of
+selector.  If @code{selector} is 0, it generates @code{prefetchnta}; for
+a value of 1, it generates @code{prefetcht0}; for a value of 2, it generates
+@code{prefetcht1}; and for a value of 3 it generates @code{prefetcht2}.
+
+@end table
+
+The following builtins are available when @option{-msse} is used.
+
+@table @code
+@item int __builtin_ia32_comieq (v4sf, v4sf)
+Generates the @code{comiss} machine instruction and performs an equality
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_comineq (v4sf, v4sf)
+Generates the @code{comiss} machine instruction and performs an inequality
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_comilt (v4sf, v4sf)
+Generates the @code{comiss} machine instruction and performs a ``less than''
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_comile (v4sf, v4sf)
+Generates the @code{comiss} machine instruction and performs a ``less or
+equal'' comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_comigt (v4sf, v4sf)
+Generates the @code{comiss} machine instruction and performs a ``greater than''
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_comige (v4sf, v4sf)
+Generates the @code{comiss} machine instruction and performs a ``greater or
+equal'' comparison.  The return value is the truth value of that comparison.
+
+@item int __builtin_ia32_ucomieq (v4sf, v4sf)
+Generates the @code{ucomiss} machine instruction and performs an equality
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_ucomineq (v4sf, v4sf)
+Generates the @code{ucomiss} machine instruction and performs an inequality
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_ucomilt (v4sf, v4sf)
+Generates the @code{ucomiss} machine instruction and performs a ``less than''
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_ucomile (v4sf, v4sf)
+Generates the @code{ucomiss} machine instruction and performs a ``less or
+equal'' comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_ucomigt (v4sf, v4sf)
+Generates the @code{ucomiss} machine instruction and performs a ``greater than''
+comparison.  The return value is the truth value of that comparison.
+@item int __builtin_ia32_ucomige (v4sf, v4sf)
+Generates the @code{ucomiss} machine instruction and performs a ``greater or
+equal'' comparison.  The return value is the truth value of that comparison.
+
+@item v4sf __builtin_ia32_addps (v4sf, v4sf)
+Generates the @code{addps} machine instruction.
+@item v4sf __builtin_ia32_addss (v4sf, v4sf)
+Generates the @code{addss} machine instruction.
+@item v4sf __builtin_ia32_subps (v4sf, v4sf)
+Generates the @code{subps} machine instruction.
+@item v4sf __builtin_ia32_subss (v4sf, v4sf)
+Generates the @code{subss} machine instruction.
+@item v4sf __builtin_ia32_mulps (v4sf, v4sf)
+Generates the @code{mulps} machine instruction.
+@item v4sf __builtin_ia32_mulss (v4sf, v4sf)
+Generates the @code{mulss} machine instruction.
+@item v4sf __builtin_ia32_divps (v4sf, v4sf)
+Generates the @code{divps} machine instruction.
+@item v4sf __builtin_ia32_divss (v4sf, v4sf)
+Generates the @code{divss} machine instruction.
+
+@item v4si __builtin_ia32_cmpeqps (v4sf, v4sf)
+Generates the @code{cmpeqps} machine instruction.
+@item v4si __builtin_ia32_cmpltps (v4sf, v4sf)
+Generates the @code{cmpltps} machine instruction.
+@item v4si __builtin_ia32_cmpleps (v4sf, v4sf)
+Generates the @code{cmpleps} machine instruction.
+@item v4si __builtin_ia32_cmpgtps (v4sf, v4sf)
+Generates the @code{cmpgtps} machine instruction.
+@item v4si __builtin_ia32_cmpgeps (v4sf, v4sf)
+Generates the @code{cmpgeps} machine instruction.
+@item v4si __builtin_ia32_cmpunordps (v4sf, v4sf)
+Generates the @code{cmpunordps} machine instruction.
+@item v4si __builtin_ia32_cmpneqps (v4sf, v4sf)
+Generates the @code{cmpneqps} machine instruction.
+@item v4si __builtin_ia32_cmpnltps (v4sf, v4sf)
+Generates the @code{cmpnltps} machine instruction.
+@item v4si __builtin_ia32_cmpnleps (v4sf, v4sf)
+Generates the @code{cmpnleps} machine instruction.
+@item v4si __builtin_ia32_cmpngtps (v4sf, v4sf)
+Generates the @code{cmpngtps} machine instruction.
+@item v4si __builtin_ia32_cmpngeps (v4sf, v4sf)
+Generates the @code{cmpngeps} machine instruction.
+@item v4si __builtin_ia32_cmpordps (v4sf, v4sf)
+Generates the @code{cmpordps} machine instruction.
+
+@item v4si __builtin_ia32_cmpeqss (v4sf, v4sf)
+Generates the @code{cmpeqss} machine instruction.
+@item v4si __builtin_ia32_cmpltss (v4sf, v4sf)
+Generates the @code{cmpltss} machine instruction.
+@item v4si __builtin_ia32_cmpless (v4sf, v4sf)
+Generates the @code{cmpless} machine instruction.
+@item v4si __builtin_ia32_cmpgtss (v4sf, v4sf)
+Generates the @code{cmpgtss} machine instruction.
+@item v4si __builtin_ia32_cmpgess (v4sf, v4sf)
+Generates the @code{cmpgess} machine instruction.
+@item v4si __builtin_ia32_cmpunordss (v4sf, v4sf)
+Generates the @code{cmpunordss} machine instruction.
+@item v4si __builtin_ia32_cmpneqss (v4sf, v4sf)
+Generates the @code{cmpneqss} machine instruction.
+@item v4si __builtin_ia32_cmpnltss (v4sf, v4sf)
+Generates the @code{cmpnltss} machine instruction.
+@item v4si __builtin_ia32_cmpnless (v4sf, v4sf)
+Generates the @code{cmpnless} machine instruction.
+@item v4si __builtin_ia32_cmpngtss (v4sf, v4sf)
+Generates the @code{cmpngtss} machine instruction.
+@item v4si __builtin_ia32_cmpngess (v4sf, v4sf)
+Generates the @code{cmpngess} machine instruction.
+@item v4si __builtin_ia32_cmpordss (v4sf, v4sf)
+Generates the @code{cmpordss} machine instruction.
+
+@item v4sf __builtin_ia32_maxps (v4sf, v4sf)
+Generates the @code{maxps} machine instruction.
+@item v4sf __builtin_ia32_maxss (v4sf, v4sf)
+Generates the @code{maxss} machine instruction.
+@item v4sf __builtin_ia32_minps (v4sf, v4sf)
+Generates the @code{minps} machine instruction.
+@item v4sf __builtin_ia32_minss (v4sf, v4sf)
+Generates the @code{minss} machine instruction.
+
+@item ti __builtin_ia32_andps (ti, ti)
+Generates the @code{andps} machine instruction.
+@item ti __builtin_ia32_andnps (ti, ti)
+Generates the @code{andnps} machine instruction.
+@item ti __builtin_ia32_orps (ti, ti)
+Generates the @code{orps} machine instruction.
+@item ti __builtin_ia32_xorps (ti, ti)
+Generates the @code{xorps} machine instruction.
+
+@item v4sf __builtin_ia32_movps (v4sf, v4sf)
+Generates the @code{movps} machine instruction.
+@item v4sf __builtin_ia32_movhlps (v4sf, v4sf)
+Generates the @code{movhlps} machine instruction.
+@item v4sf __builtin_ia32_movlhps (v4sf, v4sf)
+Generates the @code{movlhps} machine instruction.
+@item v4sf __builtin_ia32_unpckhps (v4sf, v4sf)
+Generates the @code{unpckhps} machine instruction.
+@item v4sf __builtin_ia32_unpcklps (v4sf, v4sf)
+Generates the @code{unpcklps} machine instruction.
+
+@item v4sf __builtin_ia32_cvtpi2ps (v4sf, v2si)
+Generates the @code{cvtpi2ps} machine instruction.
+@item v2si __builtin_ia32_cvtps2pi (v4sf)
+Generates the @code{cvtps2pi} machine instruction.
+@item v4sf __builtin_ia32_cvtsi2ss (v4sf, int)
+Generates the @code{cvtsi2ss} machine instruction.
+@item int __builtin_ia32_cvtss2si (v4sf)
+Generates the @code{cvtss2si} machine instruction.
+@item v2si __builtin_ia32_cvttps2pi (v4sf)
+Generates the @code{cvttps2pi} machine instruction.
+@item int __builtin_ia32_cvttss2si (v4sf)
+Generates the @code{cvttss2si} machine instruction.
+
+@item v4sf __builtin_ia32_rcpps (v4sf)
+Generates the @code{rcpps} machine instruction.
+@item v4sf __builtin_ia32_rsqrtps (v4sf)
+Generates the @code{rsqrtps} machine instruction.
+@item v4sf __builtin_ia32_sqrtps (v4sf)
+Generates the @code{sqrtps} machine instruction.
+@item v4sf __builtin_ia32_rcpss (v4sf)
+Generates the @code{rcpss} machine instruction.
+@item v4sf __builtin_ia32_rsqrtss (v4sf)
+Generates the @code{rsqrtss} machine instruction.
+@item v4sf __builtin_ia32_sqrtss (v4sf)
+Generates the @code{sqrtss} machine instruction.
+
+@item v4sf __builtin_ia32_shufps (v4sf, v4sf, int)
+Generates the @code{shufps} machine instruction.
+
+@item v4sf __builtin_ia32_loadaps (float *)
+Generates the @code{movaps} machine instruction as a load from memory.
+@item void __builtin_ia32_storeaps (float *, v4sf)
+Generates the @code{movaps} machine instruction as a store to memory.
+@item v4sf __builtin_ia32_loadups (float *)
+Generates the @code{movups} machine instruction as a load from memory.
+@item void __builtin_ia32_storeups (float *, v4sf)
+Generates the @code{movups} machine instruction as a store to memory.
+@item v4sf __builtin_ia32_loadss (float *)
+Generates the @code{movss} machine instruction as a load from memory.
+@item void __builtin_ia32_storess (float *, v4sf)
+Generates the @code{movss} machine instruction as a store to memory.
+
+@item v4sf __builtin_ia32_loadhps (v4sf, v2si *)
+Generates the @code{movhps} machine instruction as a load from memory.
+@item v4sf __builtin_ia32_loadlps (v4sf, v2si *)
+Generates the @code{movlps} machine instruction as a load from memory.
+@item void __builtin_ia32_storehps (v2si *, v4sf)
+Generates the @code{movhps} machine instruction as a store to memory.
+@item void __builtin_ia32_storelps (v2si *, v4sf)
+Generates the @code{movlps} machine instruction as a store to memory.
+
+@item void __builtin_ia32_movntps (float *, v4sf)
+Generates the @code{movntps} machine instruction.
+@item int __builtin_ia32_movmskps (v4sf)
+Generates the @code{movmskps} machine instruction.
+
+@item void __builtin_ia32_storeps1 (float *, v4sf)
+Generates the @code{movaps} machine instruction as a store to memory.
+Before storing, the value is modified with a @code{shufps} instruction
+so that the lowest of the four floating point elements is replicated
+across the entire vector that is stored.
+@item void __builtin_ia32_storerps (float *, v4sf)
+Generates the @code{movaps} machine instruction as a store to memory.
+Before storing, the value is modified with a @code{shufps} instruction
+so that the order of the four floating point elements in the vector is
+reversed.
+@item v4sf __builtin_ia32_loadps1 (float *)
+Generates a @code{movss} machine instruction to load a floating point
+value from memory, and a @code{shufps} instruction to replicate the
+loaded value across all four elements of the result vector.
+@item v4sf __builtin_ia32_loadrps (float *)
+Generates a @code{movaps} machine instruction to load a vector from
+memory, and a @code{shufps} instruction to reverse the order of the
+four floating point elements in the result vector.
+@item v4sf __builtin_ia32_setps (float, float, float, float)
+Constructs a vector from four single floating point values.  The return
+value is equal to the value that would result from storing the four
+arguments into consecutive memory locations and then executing a
+@code{movaps} to load the vector from memory.
+@item v4sf __builtin_ia32_setps1 (float)
+Constructs a vector from a single floating point value by replicating
+it across all four elements of the result vector.
+@end table
+
@item -mpush-args
@itemx -mno-push-args
@opindex mpush-args