Discussion:
[edk2] Enable optimization for gcc x64 builds
Scott Duplichan
2014-10-29 04:59:26 UTC
Permalink
Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32
builds, -Os (optimize for small code size) is used. Why is this? Apparently
it is because variable argument list handling fails when gcc X64 optimization
is enabled. The solution is an improvement to the patch of SVN rev 10440:
http://sourceforge.net/p/edk2/mailman/message/25121111/

The patch in this email only adds gcc X64 optimization for gcc versions 4.8
and newer. This is because testing with older versions of gcc is a lot of
work. On the other hand, the patch could be a lot simpler if it were to
ignore gcc version. The patch is boot tested using Duet with gcc 4.8.2 and
gcc 4.9.1. For these two cases, the print formatting problem is resolved
by the patch.

Should we:
1) Restrict the change to recent gcc versions where testing is easy
(approach of included patch)
2) Apply the change to all gcc versions, and let older versions go
untested?
3) Try to find/build the needed older gcc versions so that the patch
can apply to all versions and be tested too

Thanks,
Scott
--
Enable optimization for gcc x64 builds.

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Scott Duplichan <***@notabs.org>

--

Index: BaseTools/Conf/tools_def.template
===================================================================
--- BaseTools/Conf/tools_def.template (revision 16254)
+++ BaseTools/Conf/tools_def.template (working copy)
@@ -3843,7 +3843,7 @@

DEFINE GCC44_ALL_CC_FLAGS = -g -fshort-wchar -fno-strict-aliasing -Wall -Werror -Wno-array-bounds -ffunction-sections -fdata-sections -c -include AutoGen.h -DSTRING_ARRAY_NAME=$(BASE_NAME)Strings
DEFINE GCC44_IA32_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m32 -malign-double -fno-stack-protector -D EFI32
-DEFINE GCC44_X64_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m64 -fno-stack-protector "-DEFIAPI=__attribute__((ms_abi))" -DNO_BUILTIN_VA_FUNCS -mno-red-zone -Wno-address -mcmodel=large
+DEFINE GCC44_X64_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m64 -fno-stack-protector "-DEFIAPI=__attribute__((ms_abi))" -mno-red-zone -Wno-address -mcmodel=large
DEFINE GCC44_IA32_X64_DLINK_COMMON = -nostdlib -n -q --gc-sections --script=$(EDK_TOOLS_PATH)/Scripts/gcc4.4-ld-script
DEFINE GCC44_IA32_X64_ASLDLINK_FLAGS = DEF(GCC44_IA32_X64_DLINK_COMMON) --entry ReferenceAcpiTable -u ReferenceAcpiTable
DEFINE GCC44_IA32_X64_DLINK_FLAGS = DEF(GCC44_IA32_X64_DLINK_COMMON) --entry $(IMAGE_ENTRY_POINT) -u $(IMAGE_ENTRY_POINT) -Map $(DEST_DIR_DEBUG)/$(BASE_NAME).map
@@ -4420,7 +4420,7 @@
*_GCC48_X64_ASLCC_FLAGS = DEF(GCC_ASLCC_FLAGS) -m64
*_GCC48_X64_ASLDLINK_FLAGS = DEF(GCC48_IA32_X64_ASLDLINK_FLAGS) -m elf_x86_64
*_GCC48_X64_ASM_FLAGS = DEF(GCC48_ASM_FLAGS) -m64
-*_GCC48_X64_CC_FLAGS = DEF(GCC48_X64_CC_FLAGS)
+*_GCC48_X64_CC_FLAGS = DEF(GCC48_X64_CC_FLAGS) -Os
*_GCC48_X64_DLINK_FLAGS = DEF(GCC48_X64_DLINK_FLAGS)
*_GCC48_X64_RC_FLAGS = DEF(GCC_X64_RC_FLAGS)
*_GCC48_X64_OBJCOPY_FLAGS =
@@ -4542,7 +4542,7 @@
*_GCC49_X64_ASLCC_FLAGS = DEF(GCC_ASLCC_FLAGS) -m64
*_GCC49_X64_ASLDLINK_FLAGS = DEF(GCC49_IA32_X64_ASLDLINK_FLAGS) -m elf_x86_64
*_GCC49_X64_ASM_FLAGS = DEF(GCC49_ASM_FLAGS) -m64
-*_GCC49_X64_CC_FLAGS = DEF(GCC49_X64_CC_FLAGS)
+*_GCC49_X64_CC_FLAGS = DEF(GCC49_X64_CC_FLAGS) -Os
*_GCC49_X64_DLINK_FLAGS = DEF(GCC49_X64_DLINK_FLAGS)
*_GCC49_X64_RC_FLAGS = DEF(GCC_X64_RC_FLAGS)
*_GCC49_X64_OBJCOPY_FLAGS =
Index: EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h
===================================================================
--- EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h (revision 16254)
+++ EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h (working copy)
@@ -66,6 +66,7 @@
@return The aligned size.
**/
#define _INT_SIZE_OF(n) ((sizeof (n) + sizeof (UINTN) - 1) &~(sizeof (UINTN) - 1))
+#define GCC_VERSION (__GNUC__ * 10 + __GNUC_MINOR__)

#if defined(__CC_ARM)
//
@@ -92,25 +93,37 @@

#define VA_COPY(Dest, Start) __va_copy (Dest, Start)

-#elif defined(__GNUC__) && !defined(NO_BUILTIN_VA_FUNCS)
+#elif defined(__GNUC__) && !defined(__x86_64__)
//
// Use GCC built-in macros for variable argument lists.
//

///
-/// Variable used to traverse the list of arguments. This type can vary by
-/// implementation and could be an array or structure.
+/// Variable used to traverse the list of arguments. This type can vary by
+/// implementation and could be an array or structure.
///
-typedef __builtin_va_list VA_LIST;
+ typedef __builtin_va_list VA_LIST;

-#define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)
+ #define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)

-#define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))

-#define VA_END(Marker) __builtin_va_end (Marker)
+ #define VA_END(Marker) __builtin_va_end (Marker)

-#define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)
+ #define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)

+#elif defined(__GNUC__) && defined(__x86_64__) && (GCC_VERSION >= 48)
+
+ typedef __builtin_ms_va_list VA_LIST;
+
+ #define VA_START(Marker, Parameter) __builtin_ms_va_start (Marker, Parameter)
+
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+
+ #define VA_END(Marker) __builtin_ms_va_end (Marker)
+
+ #define VA_COPY(Dest, Start) __builtin_ms_va_copy (Dest, Start)
+
#else

#ifndef VA_START
Index: MdePkg/Include/Base.h
===================================================================
--- MdePkg/Include/Base.h (revision 16254)
+++ MdePkg/Include/Base.h (working copy)
@@ -441,6 +441,7 @@
@return The aligned size.
**/
#define _INT_SIZE_OF(n) ((sizeof (n) + sizeof (UINTN) - 1) &~(sizeof (UINTN) - 1))
+#define GCC_VERSION (__GNUC__ * 10 + __GNUC_MINOR__)

#if defined(__CC_ARM)
//
@@ -472,7 +473,7 @@

#define VA_COPY(Dest, Start) __va_copy (Dest, Start)

-#elif defined(__GNUC__) && !defined(NO_BUILTIN_VA_FUNCS)
+#elif defined(__GNUC__) && !defined(__x86_64__)
//
// Use GCC built-in macros for variable argument lists.
//
@@ -481,16 +482,28 @@
/// Variable used to traverse the list of arguments. This type can vary by
/// implementation and could be an array or structure.
///
-typedef __builtin_va_list VA_LIST;
+ typedef __builtin_va_list VA_LIST;

-#define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)
+ #define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)

-#define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))

-#define VA_END(Marker) __builtin_va_end (Marker)
+ #define VA_END(Marker) __builtin_va_end (Marker)

-#define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)
+ #define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)

+#elif defined(__GNUC__) && defined(__x86_64__) && (GCC_VERSION >= 48)
+
+ typedef __builtin_ms_va_list VA_LIST;
+
+ #define VA_START(Marker, Parameter) __builtin_ms_va_start (Marker, Parameter)
+
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+
+ #define VA_END(Marker) __builtin_ms_va_end (Marker)
+
+ #define VA_COPY(Dest, Start) __builtin_ms_va_copy (Dest, Start)
+
#else
///
/// Variable used to traverse the list of arguments. This type can vary by
--
Bruce Cran
2014-10-29 15:52:05 UTC
Permalink
Post by Scott Duplichan
Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32
builds, -Os (optimize for small code size) is used. Why is this? Apparently
it is because variable argument list handling fails when gcc X64 optimization
http://sourceforge.net/p/edk2/mailman/message/25121111/
Related to gcc flags, should we also enable -ffreestanding by default,
or at least -fno-builtin?
--
Bruce

------------------------------------------------------------------------------
Scott Duplichan
2014-10-29 22:12:31 UTC
Permalink
Bruce Cran [mailto:***@gmail.com] wrote:

]On Tue, Oct 28, 2014 at 10:59 PM, Scott Duplichan <***@notabs.org> wrote:
]> Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32
]> builds, -Os (optimize for small code size) is used. Why is this? Apparently
]> it is because variable argument list handling fails when gcc X64 optimization
]> is enabled. The solution is an improvement to the patch of SVN rev 10440:
]> http://sourceforge.net/p/edk2/mailman/message/25121111/
]
]Related to gcc flags, should we also enable -ffreestanding by default,
]or at least -fno-builtin?

Those flags certainly seem appropriate for UEFI use. In fact, AARCH64, XCLANG,
XCODE5 already use -fno-builtin. But the reality is that most of the patches I
submit get rejected, even when there is a clear benefit. To even have a chance
of getting these options approved, there would have to be some easy to demonstrate
benefit. For example, maybe using these options can prevent gcc from replacing
copy loops in source code with a call to memcpy.

Thanks,
Scott

]--
]Bruce



------------------------------------------------------------------------------
Tim Lewis
2014-10-29 16:18:33 UTC
Permalink
Scott --

For historical perspective, the EDK2 build flags have focused on space over speed because of the code size constraints placed on flash-resident code. Not being as familiar with gcc as I am with VS20xx, I don't know whether these can be set together.

Tim

-----Original Message-----
From: Scott Duplichan [mailto:***@notabs.org]
Sent: Tuesday, October 28, 2014 9:59 PM
To: edk2-***@lists.sourceforge.net
Subject: [edk2] Enable optimization for gcc x64 builds

Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32 builds, -Os (optimize for small code size) is used. Why is this? Apparently it is because variable argument list handling fails when gcc X64 optimization is enabled. The solution is an improvement to the patch of SVN rev 10440:
http://sourceforge.net/p/edk2/mailman/message/25121111/

The patch in this email only adds gcc X64 optimization for gcc versions 4.8 and newer. This is because testing with older versions of gcc is a lot of work. On the other hand, the patch could be a lot simpler if it were to ignore gcc version. The patch is boot tested using Duet with gcc 4.8.2 and gcc 4.9.1. For these two cases, the print formatting problem is resolved by the patch.

Should we:
1) Restrict the change to recent gcc versions where testing is easy
(approach of included patch)
2) Apply the change to all gcc versions, and let older versions go
untested?
3) Try to find/build the needed older gcc versions so that the patch
can apply to all versions and be tested too

Thanks,
Scott
--
Enable optimization for gcc x64 builds.

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Scott Duplichan <***@notabs.org>

--

Index: BaseTools/Conf/tools_def.template ===================================================================
--- BaseTools/Conf/tools_def.template (revision 16254)
+++ BaseTools/Conf/tools_def.template (working copy)
@@ -3843,7 +3843,7 @@

DEFINE GCC44_ALL_CC_FLAGS = -g -fshort-wchar -fno-strict-aliasing -Wall -Werror -Wno-array-bounds -ffunction-sections -fdata-sections -c -include AutoGen.h -DSTRING_ARRAY_NAME=$(BASE_NAME)Strings
DEFINE GCC44_IA32_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m32 -malign-double -fno-stack-protector -D EFI32
-DEFINE GCC44_X64_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m64 -fno-stack-protector "-DEFIAPI=__attribute__((ms_abi))" -DNO_BUILTIN_VA_FUNCS -mno-red-zone -Wno-address -mcmodel=large
+DEFINE GCC44_X64_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m64 -fno-stack-protector "-DEFIAPI=__attribute__((ms_abi))" -mno-red-zone -Wno-address -mcmodel=large
DEFINE GCC44_IA32_X64_DLINK_COMMON = -nostdlib -n -q --gc-sections --script=$(EDK_TOOLS_PATH)/Scripts/gcc4.4-ld-script
DEFINE GCC44_IA32_X64_ASLDLINK_FLAGS = DEF(GCC44_IA32_X64_DLINK_COMMON) --entry ReferenceAcpiTable -u ReferenceAcpiTable
DEFINE GCC44_IA32_X64_DLINK_FLAGS = DEF(GCC44_IA32_X64_DLINK_COMMON) --entry $(IMAGE_ENTRY_POINT) -u $(IMAGE_ENTRY_POINT) -Map $(DEST_DIR_DEBUG)/$(BASE_NAME).map
@@ -4420,7 +4420,7 @@
*_GCC48_X64_ASLCC_FLAGS = DEF(GCC_ASLCC_FLAGS) -m64
*_GCC48_X64_ASLDLINK_FLAGS = DEF(GCC48_IA32_X64_ASLDLINK_FLAGS) -m elf_x86_64
*_GCC48_X64_ASM_FLAGS = DEF(GCC48_ASM_FLAGS) -m64
-*_GCC48_X64_CC_FLAGS = DEF(GCC48_X64_CC_FLAGS)
+*_GCC48_X64_CC_FLAGS = DEF(GCC48_X64_CC_FLAGS) -Os
*_GCC48_X64_DLINK_FLAGS = DEF(GCC48_X64_DLINK_FLAGS)
*_GCC48_X64_RC_FLAGS = DEF(GCC_X64_RC_FLAGS)
*_GCC48_X64_OBJCOPY_FLAGS =
@@ -4542,7 +4542,7 @@
*_GCC49_X64_ASLCC_FLAGS = DEF(GCC_ASLCC_FLAGS) -m64
*_GCC49_X64_ASLDLINK_FLAGS = DEF(GCC49_IA32_X64_ASLDLINK_FLAGS) -m elf_x86_64
*_GCC49_X64_ASM_FLAGS = DEF(GCC49_ASM_FLAGS) -m64
-*_GCC49_X64_CC_FLAGS = DEF(GCC49_X64_CC_FLAGS)
+*_GCC49_X64_CC_FLAGS = DEF(GCC49_X64_CC_FLAGS) -Os
*_GCC49_X64_DLINK_FLAGS = DEF(GCC49_X64_DLINK_FLAGS)
*_GCC49_X64_RC_FLAGS = DEF(GCC_X64_RC_FLAGS)
*_GCC49_X64_OBJCOPY_FLAGS =
Index: EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h
===================================================================
--- EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h (revision 16254)
+++ EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h (working copy)
@@ -66,6 +66,7 @@
@return The aligned size.
**/
#define _INT_SIZE_OF(n) ((sizeof (n) + sizeof (UINTN) - 1) &~(sizeof (UINTN) - 1))
+#define GCC_VERSION (__GNUC__ * 10 + __GNUC_MINOR__)

#if defined(__CC_ARM)
//
@@ -92,25 +93,37 @@

#define VA_COPY(Dest, Start) __va_copy (Dest, Start)

-#elif defined(__GNUC__) && !defined(NO_BUILTIN_VA_FUNCS)
+#elif defined(__GNUC__) && !defined(__x86_64__)
//
// Use GCC built-in macros for variable argument lists.
//

///
-/// Variable used to traverse the list of arguments. This type can vary by -/// implementation and could be an array or structure.
+/// Variable used to traverse the list of arguments. This type can vary
+by /// implementation and could be an array or structure.
///
-typedef __builtin_va_list VA_LIST;
+ typedef __builtin_va_list VA_LIST;

-#define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)
+ #define VA_START(Marker, Parameter) __builtin_va_start (Marker,
+ Parameter)

-#define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))

-#define VA_END(Marker) __builtin_va_end (Marker)
+ #define VA_END(Marker) __builtin_va_end (Marker)

-#define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)
+ #define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)

+#elif defined(__GNUC__) && defined(__x86_64__) && (GCC_VERSION >= 48)
+
+ typedef __builtin_ms_va_list VA_LIST;
+
+ #define VA_START(Marker, Parameter) __builtin_ms_va_start (Marker,
+ Parameter)
+
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+
+ #define VA_END(Marker) __builtin_ms_va_end (Marker)
+
+ #define VA_COPY(Dest, Start) __builtin_ms_va_copy (Dest, Start)
+
#else

#ifndef VA_START
Index: MdePkg/Include/Base.h
===================================================================
--- MdePkg/Include/Base.h (revision 16254)
+++ MdePkg/Include/Base.h (working copy)
@@ -441,6 +441,7 @@
@return The aligned size.
**/
#define _INT_SIZE_OF(n) ((sizeof (n) + sizeof (UINTN) - 1) &~(sizeof (UINTN) - 1))
+#define GCC_VERSION (__GNUC__ * 10 + __GNUC_MINOR__)

#if defined(__CC_ARM)
//
@@ -472,7 +473,7 @@

#define VA_COPY(Dest, Start) __va_copy (Dest, Start)

-#elif defined(__GNUC__) && !defined(NO_BUILTIN_VA_FUNCS)
+#elif defined(__GNUC__) && !defined(__x86_64__)
//
// Use GCC built-in macros for variable argument lists.
//
@@ -481,16 +482,28 @@
/// Variable used to traverse the list of arguments. This type can vary by /// implementation and could be an array or structure.
///
-typedef __builtin_va_list VA_LIST;
+ typedef __builtin_va_list VA_LIST;

-#define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)
+ #define VA_START(Marker, Parameter) __builtin_va_start (Marker,
+ Parameter)

-#define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))

-#define VA_END(Marker) __builtin_va_end (Marker)
+ #define VA_END(Marker) __builtin_va_end (Marker)

-#define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)
+ #define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)

+#elif defined(__GNUC__) && defined(__x86_64__) && (GCC_VERSION >= 48)
+
+ typedef __builtin_ms_va_list VA_LIST;
+
+ #define VA_START(Marker, Parameter) __builtin_ms_va_start (Marker,
+ Parameter)
+
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+
+ #define VA_END(Marker) __builtin_ms_va_end (Marker)
+
+ #define VA_COPY(Dest, Start) __builtin_ms_va_copy (Dest, Start)
+
#else
///
/// Variable used to traverse the list of arguments. This type can vary by
--
------------------------------------------------------------------------------
Bruce Cran
2014-10-29 16:42:00 UTC
Permalink
Post by Tim Lewis
For historical perspective, the EDK2 build flags have focused on space over speed because of the code size constraints placed on flash-resident code. Not being as familiar with gcc as I am with VS20xx, I don't know whether these can be set together.
It looks like there are several places where -Os and -O2 are used:

GCC_ALL_CC_FLAGS: -Os
GCC_IA32_CC_FLAGS: -O2
*_GCC44_IA32_CC_FLAGS: -Os
*_GCC45_IA32_CC_FLAGS: -Os
*_GCC46_IA32_CC_FLAGS: -Os
*_GCC47_IA32_CC_FLAGS: -Os
*_GCC48_IA32_CC_FLAGS: -Os
*_GCC49_IA32_CC_FLAGS: -Os
*_ELFGCC_X64_CC_FLAGS: -Os

The XCODE and XCLANG definitions seem more consistent between IA32 and
X64 versions.
--
Bruce


------------------------------------------------------------------------------
Scott Duplichan
2014-10-29 19:12:14 UTC
Permalink
Tim Lewis [mailto:***@insyde.com] wrote:

]Scott --
]
]For historical perspective, the EDK2 build flags have focused on space over speed because of the code size constraints placed on ]flash-resident code. Not being as familiar with gcc as I am with VS20xx, I don't know whether these can be set together.
]
]Tim

Hello Tim,

I agree targeting small image size is a reasonable goal. Optimizations that trade
size for speed generally do not benefit UEFI or other boot code due to I/O overhead.
Though flash capacity has increased a lot lately, the access time can have a negative
effect on boot time. The current gcc x64 flags are not consistent with the goal of
small code size. This patch addresses that. For example, shell.efi x64 release build:

gcc 4.9.1 current 1295 kb
gcc 4.9.1 w/patch 1013 kb
VS2010 865 kb

What do you refer to by "set together"?

Thanks,
Scott


-----Original Message-----
From: Scott Duplichan [mailto:***@notabs.org]
Sent: Tuesday, October 28, 2014 9:59 PM
To: edk2-***@lists.sourceforge.net
Subject: [edk2] Enable optimization for gcc x64 builds

Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32 builds, -Os (optimize for small code size) is used. Why is this? Apparently it is because variable argument list handling fails when gcc X64 optimization is enabled. The solution is an improvement to the patch of SVN rev 10440:
http://sourceforge.net/p/edk2/mailman/message/25121111/

The patch in this email only adds gcc X64 optimization for gcc versions 4.8 and newer. This is because testing with older versions of gcc is a lot of work. On the other hand, the patch could be a lot simpler if it were to ignore gcc version. The patch is boot tested using Duet with gcc 4.8.2 and gcc 4.9.1. For these two cases, the print formatting problem is resolved by the patch.

Should we:
1) Restrict the change to recent gcc versions where testing is easy
(approach of included patch)
2) Apply the change to all gcc versions, and let older versions go
untested?
3) Try to find/build the needed older gcc versions so that the patch
can apply to all versions and be tested too

Thanks,
Scott
--
Enable optimization for gcc x64 builds.

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Scott Duplichan <***@notabs.org>

--

Index: BaseTools/Conf/tools_def.template ===================================================================
--- BaseTools/Conf/tools_def.template (revision 16254)
+++ BaseTools/Conf/tools_def.template (working copy)
@@ -3843,7 +3843,7 @@

DEFINE GCC44_ALL_CC_FLAGS = -g -fshort-wchar -fno-strict-aliasing -Wall -Werror -Wno-array-bounds -ffunction-sections -fdata-sections -c -include AutoGen.h -DSTRING_ARRAY_NAME=$(BASE_NAME)Strings
DEFINE GCC44_IA32_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m32 -malign-double -fno-stack-protector -D EFI32
-DEFINE GCC44_X64_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m64 -fno-stack-protector "-DEFIAPI=__attribute__((ms_abi))" -DNO_BUILTIN_VA_FUNCS -mno-red-zone -Wno-address -mcmodel=large
+DEFINE GCC44_X64_CC_FLAGS = DEF(GCC44_ALL_CC_FLAGS) -m64 -fno-stack-protector "-DEFIAPI=__attribute__((ms_abi))" -mno-red-zone -Wno-address -mcmodel=large
DEFINE GCC44_IA32_X64_DLINK_COMMON = -nostdlib -n -q --gc-sections --script=$(EDK_TOOLS_PATH)/Scripts/gcc4.4-ld-script
DEFINE GCC44_IA32_X64_ASLDLINK_FLAGS = DEF(GCC44_IA32_X64_DLINK_COMMON) --entry ReferenceAcpiTable -u ReferenceAcpiTable
DEFINE GCC44_IA32_X64_DLINK_FLAGS = DEF(GCC44_IA32_X64_DLINK_COMMON) --entry $(IMAGE_ENTRY_POINT) -u $(IMAGE_ENTRY_POINT) -Map $(DEST_DIR_DEBUG)/$(BASE_NAME).map
@@ -4420,7 +4420,7 @@
*_GCC48_X64_ASLCC_FLAGS = DEF(GCC_ASLCC_FLAGS) -m64
*_GCC48_X64_ASLDLINK_FLAGS = DEF(GCC48_IA32_X64_ASLDLINK_FLAGS) -m elf_x86_64
*_GCC48_X64_ASM_FLAGS = DEF(GCC48_ASM_FLAGS) -m64
-*_GCC48_X64_CC_FLAGS = DEF(GCC48_X64_CC_FLAGS)
+*_GCC48_X64_CC_FLAGS = DEF(GCC48_X64_CC_FLAGS) -Os
*_GCC48_X64_DLINK_FLAGS = DEF(GCC48_X64_DLINK_FLAGS)
*_GCC48_X64_RC_FLAGS = DEF(GCC_X64_RC_FLAGS)
*_GCC48_X64_OBJCOPY_FLAGS =
@@ -4542,7 +4542,7 @@
*_GCC49_X64_ASLCC_FLAGS = DEF(GCC_ASLCC_FLAGS) -m64
*_GCC49_X64_ASLDLINK_FLAGS = DEF(GCC49_IA32_X64_ASLDLINK_FLAGS) -m elf_x86_64
*_GCC49_X64_ASM_FLAGS = DEF(GCC49_ASM_FLAGS) -m64
-*_GCC49_X64_CC_FLAGS = DEF(GCC49_X64_CC_FLAGS)
+*_GCC49_X64_CC_FLAGS = DEF(GCC49_X64_CC_FLAGS) -Os
*_GCC49_X64_DLINK_FLAGS = DEF(GCC49_X64_DLINK_FLAGS)
*_GCC49_X64_RC_FLAGS = DEF(GCC_X64_RC_FLAGS)
*_GCC49_X64_OBJCOPY_FLAGS =
Index: EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h
===================================================================
--- EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h (revision 16254)
+++ EdkCompatibilityPkg/Foundation/Include/EfiStdArg.h (working copy)
@@ -66,6 +66,7 @@
@return The aligned size.
**/
#define _INT_SIZE_OF(n) ((sizeof (n) + sizeof (UINTN) - 1) &~(sizeof (UINTN) - 1))
+#define GCC_VERSION (__GNUC__ * 10 + __GNUC_MINOR__)

#if defined(__CC_ARM)
//
@@ -92,25 +93,37 @@

#define VA_COPY(Dest, Start) __va_copy (Dest, Start)

-#elif defined(__GNUC__) && !defined(NO_BUILTIN_VA_FUNCS)
+#elif defined(__GNUC__) && !defined(__x86_64__)
//
// Use GCC built-in macros for variable argument lists.
//

///
-/// Variable used to traverse the list of arguments. This type can vary by -/// implementation and could be an array or structure.
+/// Variable used to traverse the list of arguments. This type can vary
+by /// implementation and could be an array or structure.
///
-typedef __builtin_va_list VA_LIST;
+ typedef __builtin_va_list VA_LIST;

-#define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)
+ #define VA_START(Marker, Parameter) __builtin_va_start (Marker,
+ Parameter)

-#define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))

-#define VA_END(Marker) __builtin_va_end (Marker)
+ #define VA_END(Marker) __builtin_va_end (Marker)

-#define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)
+ #define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)

+#elif defined(__GNUC__) && defined(__x86_64__) && (GCC_VERSION >= 48)
+
+ typedef __builtin_ms_va_list VA_LIST;
+
+ #define VA_START(Marker, Parameter) __builtin_ms_va_start (Marker,
+ Parameter)
+
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+
+ #define VA_END(Marker) __builtin_ms_va_end (Marker)
+
+ #define VA_COPY(Dest, Start) __builtin_ms_va_copy (Dest, Start)
+
#else

#ifndef VA_START
Index: MdePkg/Include/Base.h
===================================================================
--- MdePkg/Include/Base.h (revision 16254)
+++ MdePkg/Include/Base.h (working copy)
@@ -441,6 +441,7 @@
@return The aligned size.
**/
#define _INT_SIZE_OF(n) ((sizeof (n) + sizeof (UINTN) - 1) &~(sizeof (UINTN) - 1))
+#define GCC_VERSION (__GNUC__ * 10 + __GNUC_MINOR__)

#if defined(__CC_ARM)
//
@@ -472,7 +473,7 @@

#define VA_COPY(Dest, Start) __va_copy (Dest, Start)

-#elif defined(__GNUC__) && !defined(NO_BUILTIN_VA_FUNCS)
+#elif defined(__GNUC__) && !defined(__x86_64__)
//
// Use GCC built-in macros for variable argument lists.
//
@@ -481,16 +482,28 @@
/// Variable used to traverse the list of arguments. This type can vary by /// implementation and could be an array or structure.
///
-typedef __builtin_va_list VA_LIST;
+ typedef __builtin_va_list VA_LIST;

-#define VA_START(Marker, Parameter) __builtin_va_start (Marker, Parameter)
+ #define VA_START(Marker, Parameter) __builtin_va_start (Marker,
+ Parameter)

-#define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))

-#define VA_END(Marker) __builtin_va_end (Marker)
+ #define VA_END(Marker) __builtin_va_end (Marker)

-#define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)
+ #define VA_COPY(Dest, Start) __builtin_va_copy (Dest, Start)

+#elif defined(__GNUC__) && defined(__x86_64__) && (GCC_VERSION >= 48)
+
+ typedef __builtin_ms_va_list VA_LIST;
+
+ #define VA_START(Marker, Parameter) __builtin_ms_va_start (Marker,
+ Parameter)
+
+ #define VA_ARG(Marker, TYPE) ((sizeof (TYPE) < sizeof (UINTN)) ? (TYPE)(__builtin_va_arg (Marker, UINTN)) : (TYPE)(__builtin_va_arg (Marker, TYPE)))
+
+ #define VA_END(Marker) __builtin_ms_va_end (Marker)
+
+ #define VA_COPY(Dest, Start) __builtin_ms_va_copy (Dest, Start)
+
#else
///
/// Variable used to traverse the list of arguments. This type can vary by
--
Laszlo Ersek
2014-11-03 16:21:56 UTC
Permalink
Post by Tim Lewis
Scott --
For historical perspective, the EDK2 build flags have focused on
space over speed because of the code size constraints placed on
flash-resident code. Not being as familiar with gcc as I am with
VS20xx, I don't know whether these can be set together.
Maybe I'm misunderstanding (and in that case I apologize), but I'm under
the impression that you are misunderstanding '-Os'. '-Os' optimizes for
size, not speed. So Scott's patch seems to be aligned with the
tradition. (Currently no gcc optimizations are enabled at all when
building for X64, neither for speed nor for size.)

-Os Optimize for size. -Os enables all -O2 optimizations that
do not typically increase code size. It also performs
further optimizations designed to reduce code size.

-Os disables the following optimization flags:
-falign-functions -falign-jumps -falign-loops
-falign-labels -freorder-blocks
-freorder-blocks-and-partition -fprefetch-loop-arrays
-ftree-vect-loop-version

Again I'm sorry if I misunderstood you.

Thanks
Laszlo
Post by Tim Lewis
Tim
-----Original Message-----
Sent: Tuesday, October 28, 2014 9:59 PM
Subject: [edk2] Enable optimization for gcc x64 builds
Optimization is not enabled for x64 builds using gcc 4.4-4.9. For
IA32 builds, -Os (optimize for small code size) is used. Why is this?
Apparently it is because variable argument list handling fails when
gcc X64 optimization is enabled. The solution is an improvement to
http://sourceforge.net/p/edk2/mailman/message/25121111/
The patch in this email only adds gcc X64 optimization for gcc
versions 4.8 and newer. This is because testing with older versions
of gcc is a lot of work. On the other hand, the patch could be a lot
simpler if it were to ignore gcc version. The patch is boot tested
using Duet with gcc 4.8.2 and gcc 4.9.1. For these two cases, the
print formatting problem is resolved by the patch.
1) Restrict the change to recent gcc versions where testing is easy
(approach of included patch)
2) Apply the change to all gcc versions, and let older versions go
untested?
3) Try to find/build the needed older gcc versions so that the patch
can apply to all versions and be tested too
Thanks,
Scott
------------------------------------------------------------------------------
Bruce Cran
2014-11-03 19:54:38 UTC
Permalink
Post by Laszlo Ersek
So Scott's patch seems to be aligned with the
tradition. (Currently no gcc optimizations are enabled at all when
building for X64, neither for speed nor for size.)
Doesn't the following cause X64 builds to use -Os (and IA32 to use -O2,
if it wasn't overridden by -Os by later, version-specific flags)?

DEFINE GCC_ALL_CC_FLAGS = -g -Os -fshort-wchar
-fno-strict-aliasing -Wall -Werror -Wno-array-bounds -c -include AutoGen.h

DEFINE GCC_IA32_CC_FLAGS = DEF(GCC_ALL_CC_FLAGS) -m32
-malign-double -freorder-blocks -freorder-blocks-and-partition -O2
-mno-stack-arg-probe

DEFINE GCC_X64_CC_FLAGS = DEF(GCC_ALL_CC_FLAGS) -mno-red-zone
-Wno-address -mno-stack-arg-probeq
--
Bruce

------------------------------------------------------------------------------
Laszlo Ersek
2014-11-03 20:20:03 UTC
Permalink
Post by Bruce Cran
Post by Laszlo Ersek
So Scott's patch seems to be aligned with the
tradition. (Currently no gcc optimizations are enabled at all when
building for X64, neither for speed nor for size.)
Doesn't the following cause X64 builds to use -Os (and IA32 to use -O2,
if it wasn't overridden by -Os by later, version-specific flags)?
DEFINE GCC_ALL_CC_FLAGS = -g -Os -fshort-wchar
-fno-strict-aliasing -Wall -Werror -Wno-array-bounds -c -include AutoGen.h
DEFINE GCC_IA32_CC_FLAGS = DEF(GCC_ALL_CC_FLAGS) -m32
-malign-double -freorder-blocks -freorder-blocks-and-partition -O2
-mno-stack-arg-probe
DEFINE GCC_X64_CC_FLAGS = DEF(GCC_ALL_CC_FLAGS) -mno-red-zone
-Wno-address -mno-stack-arg-probeq
No; GCC_X64_CC_FLAGS is an "internal use" define. Its name doesn't
follow the

(DEBUG|RELEASE|NOOPT)_<TOOLCHAIN>_<ARCH>_CC_FLAGS

pattern, hence it has no direct effect. It only has an effect via
assignment to macros that *do* follow the above pattern, and such
assignments don't seem to exist (for the cases that we presently care
about):

*_GCC44_X64_CC_FLAGS = DEF(GCC44_X64_CC_FLAGS)
*_GCC45_X64_CC_FLAGS = DEF(GCC45_X64_CC_FLAGS)
*_GCC46_X64_CC_FLAGS = DEF(GCC46_X64_CC_FLAGS)
*_GCC47_X64_CC_FLAGS = DEF(GCC47_X64_CC_FLAGS)
*_GCC48_X64_CC_FLAGS = DEF(GCC48_X64_CC_FLAGS)
*_GCC49_X64_CC_FLAGS = DEF(GCC49_X64_CC_FLAGS)

The RHS macros all chain to GCC44_ALL_CC_FLAGS (recursively), which is
ultimately open-coded as:

DEFINE GCC44_ALL_CC_FLAGS = -g -fshort-wchar
-fno-strict-aliasing -Wall -Werror -Wno-array-bounds -ffunction-sections
-fdata-sections -c -include AutoGen.h
-DSTRING_ARRAY_NAME=$(BASE_NAME)Strings

In short, GCC_X64_CC_FLAGS is not used where it would matter (for this
case).

Thanks
Laszlo


------------------------------------------------------------------------------
Bruce Cran
2014-11-03 20:26:58 UTC
Permalink
Ah, thanks - I failed to trace it all the way through and just assumed
it *was* used by one of the BUILDTYPE_TOOLCHAIN_ARCH_CC_FLAGS lines.
--
Bruce
Post by Laszlo Ersek
No; GCC_X64_CC_FLAGS is an "internal use" define. Its name doesn't
follow the
(DEBUG|RELEASE|NOOPT)_<TOOLCHAIN>_<ARCH>_CC_FLAGS
pattern, hence it has no direct effect. It only has an effect via
assignment to macros that *do* follow the above pattern, and such
assignments don't seem to exist (for the cases that we presently care
------------------------------------------------------------------------------
Scott Duplichan
2014-11-03 20:43:32 UTC
Permalink
Bruce Cran [mailto:***@gmail.com] wrote:

]On 11/3/2014 9:21 AM, Laszlo Ersek wrote:
]
]> So Scott's patch seems to be aligned with the
]> tradition. (Currently no gcc optimizations are enabled at all when
]> building for X64, neither for speed nor for size.)
]
]Doesn't the following cause X64 builds to use -Os (and IA32 to use -O2,
]if it wasn't overridden by -Os by later, version-specific flags)?
]
]DEFINE GCC_ALL_CC_FLAGS = -g -Os -fshort-wchar
]-fno-strict-aliasing -Wall -Werror -Wno-array-bounds -c -include AutoGen.h
]
]DEFINE GCC_IA32_CC_FLAGS = DEF(GCC_ALL_CC_FLAGS) -m32
]-malign-double -freorder-blocks -freorder-blocks-and-partition -O2
]-mno-stack-arg-probe
]
]DEFINE GCC_X64_CC_FLAGS = DEF(GCC_ALL_CC_FLAGS) -mno-red-zone
]-Wno-address -mno-stack-arg-probeq
]
]--
]Bruce

I believe you are right for the case of UNIXGCC and CYGGCC X64. ELFGCC
X64 also uses -Os. But I happened to be testing with GCC49 X64, and that
one doesn't use any -O flag. Reducing/consolidating the various gcc tool
chain definitions would be helpful.

Thanks,
Scott


------------------------------------------------------------------------------
Laszlo Ersek
2014-11-03 16:17:04 UTC
Permalink
Post by Scott Duplichan
Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32
builds, -Os (optimize for small code size) is used. Why is this? Apparently
it is because variable argument list handling fails when gcc X64 optimization
http://sourceforge.net/p/edk2/mailman/message/25121111/
My reading of r10440 is different. As far as I understand,

(gcc-4.4, X64, stdarg builtins)

is simply a broken a combination, regardless of optimization.
Post by Scott Duplichan
The patch in this email only adds gcc X64 optimization for gcc versions 4.8
and newer.
What happens if you add -Os for gcc-4.8+ (X64) without touching
NO_BUILTIN_VA_FUNCS and the VA_* macros? Just curious.

What implies any connection between lack of -Os and VA_*?

Thanks!
Laszlo
Post by Scott Duplichan
This is because testing with older versions of gcc is a lot of
work. On the other hand, the patch could be a lot simpler if it were to
ignore gcc version. The patch is boot tested using Duet with gcc 4.8.2 and
gcc 4.9.1. For these two cases, the print formatting problem is resolved
by the patch.
1) Restrict the change to recent gcc versions where testing is easy
(approach of included patch)
2) Apply the change to all gcc versions, and let older versions go
untested?
3) Try to find/build the needed older gcc versions so that the patch
can apply to all versions and be tested too
Thanks,
Scott
------------------------------------------------------------------------------
_______________________________________________
edk2-devel mailing list
https://lists.sourceforge.net/lists/listinfo/edk2-devel
------------------------------------------------------------------------------
Scott Duplichan
2014-11-03 20:24:00 UTC
Permalink
Laszlo Ersek [mailto:***@redhat.com] wrote:

]On 10/29/14 05:59, Scott Duplichan wrote:
]> Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32
]> builds, -Os (optimize for small code size) is used. Why is this? Apparently
]> it is because variable argument list handling fails when gcc X64 optimization
]> is enabled. The solution is an improvement to the patch of SVN rev 10440:
]> http://sourceforge.net/p/edk2/mailman/message/25121111/
]
]My reading of r10440 is different. As far as I understand,
]
] (gcc-4.4, X64, stdarg builtins)
]
]is simply a broken a combination, regardless of optimization.

You are right about gcc X64 builds using the standard (native)
stdarg builtins. Without the original r10440 patch, a test using
Duet crashes early on. The exception handler dump has bogus values,
probably due to the same stdarg problem.

My point was why can't -Os be used for the current gcc X64 build like
it is for the IA32 build? Maybe r10440 is not relevant enough to even
be mentioned. What I found is adding -Os to the X64 Duet project
causes the 'g' (GUID) format to malfunction. There may be other
formatting problems, but this one is most obvious in the log file:

X64 Duet boot log from gcc X64 build (standard, correct):
WELCOME TO EFI WORLD!
InstallProtocolInterface: D2B2B828-0826-48A7-B3DF-983C006024F0 1FDF9D58
HOBLIST address in DXE = 0x1F3DA018
Memory Allocation 0x00000004 0x1FD69000 - 0x1FD88FFF
Memory Allocation 0x00000004 0x1F964000 - 0x1FD68FFF
Memory Allocation 0x00000003 0x1FDB9000 - 0x1FE0FFFF
FV Hob 0x1FE10000 - 0x1FFAFFFF
InstallProtocolInterface: D8117CFE-94A6-11D4-9A3A-0090273FC14D 1FDF2AC0
InstallProtocolInterface: 8F644FA9-E850-4DB1-9CE2-0B44698E8DA4 1F3D6A30
InstallProtocolInterface: 09576E91-6D3F-11D2-8E39-00A0C969723B 1F3D7818

X64 Duet boot log from gcc X64 build (with -Os added):
WELCOME TO EFI WORLD!
InstallProtocolInterface: 00000000-30300000-332D3030-2D30303030 1FE6F8C0
HOBLIST address in DXE = 0x1F461018
Memory Allocation 0x00000004 0x1FDF0000 - 0x1FE0FFFF
Memory Allocation 0x00000004 0x1F9EB000 - 0x1FDEFFFF
Memory Allocation 0x00000003 0x1FE40000 - 0x1FE7FFFF
FV Hob 0x1FE80000 - 0x1FFAFFFF
InstallProtocolInterface: 1F9EAB08-46312000-342D3830-2D30303030 1FE69070
InstallProtocolInterface: 00000001-30300200-332D3130-2D30303230 1F45DA30
InstallProtocolInterface: 00000001-30308FA1-332D3130-2D31414630 1F45E818

If "-Os -mabi=ms" is used for the gcc X64 build, then the pre-r10440
method (using the native stdarg builtins) works. But that is just hiding
the problem.

The __builtin_ms_va_* macros for cross ABI use are not well documented
as far as I can find. File cross-stdarg.h is about it. But they have been
around for a long time, at least since gcc 4.4.

Thanks,
scott


]> The patch in this email only adds gcc X64 optimization for gcc versions 4.8
]> and newer.
]
]What happens if you add -Os for gcc-4.8+ (X64) without touching
]NO_BUILTIN_VA_FUNCS and the VA_* macros? Just curious.

See 'g' formatting problem above.

]What implies any connection between lack of -Os and VA_*?

The fact that -Os is used throughout edk2, except for the one
case where VA_* prevents it.

]Thanks!
]Laszlo
]
]> This is because testing with older versions of gcc is a lot of
]> work. On the other hand, the patch could be a lot simpler if it were to
]> ignore gcc version. The patch is boot tested using Duet with gcc 4.8.2 and
]> gcc 4.9.1. For these two cases, the print formatting problem is resolved
]> by the patch.
]>
]> Should we:
]> 1) Restrict the change to recent gcc versions where testing is easy
]> (approach of included patch)
]> 2) Apply the change to all gcc versions, and let older versions go
]> untested?
]> 3) Try to find/build the needed older gcc versions so that the patch
]> can apply to all versions and be tested too
]>
]> Thanks,
]> Scott



------------------------------------------------------------------------------
Andrew Fish
2014-11-03 21:05:33 UTC
Permalink
Post by Scott Duplichan
]> Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32
]> builds, -Os (optimize for small code size) is used. Why is this? Apparently
]> it is because variable argument list handling fails when gcc X64 optimization
]> http://sourceforge.net/p/edk2/mailman/message/25121111/ <http://sourceforge.net/p/edk2/mailman/message/25121111/>
]
]My reading of r10440 is different. As far as I understand,
]
] (gcc-4.4, X64, stdarg builtins)
]
]is simply a broken a combination, regardless of optimization.
You are right about gcc X64 builds using the standard (native)
stdarg builtins. Without the original r10440 patch, a test using
Duet crashes early on. The exception handler dump has bogus values,
probably due to the same stdarg problem.
My point was why can't -Os be used for the current gcc X64 build like
it is for the IA32 build? Maybe r10440 is not relevant enough to even
be mentioned. What I found is adding -Os to the X64 Duet project
causes the 'g' (GUID) format to malfunction. There may be other
WELCOME TO EFI WORLD!
InstallProtocolInterface: D2B2B828-0826-48A7-B3DF-983C006024F0 1FDF9D58
HOBLIST address in DXE = 0x1F3DA018
Memory Allocation 0x00000004 0x1FD69000 - 0x1FD88FFF
Memory Allocation 0x00000004 0x1F964000 - 0x1FD68FFF
Memory Allocation 0x00000003 0x1FDB9000 - 0x1FE0FFFF
FV Hob 0x1FE10000 - 0x1FFAFFFF
InstallProtocolInterface: D8117CFE-94A6-11D4-9A3A-0090273FC14D 1FDF2AC0
InstallProtocolInterface: 8F644FA9-E850-4DB1-9CE2-0B44698E8DA4 1F3D6A30
InstallProtocolInterface: 09576E91-6D3F-11D2-8E39-00A0C969723B 1F3D7818
WELCOME TO EFI WORLD!
InstallProtocolInterface: 00000000-30300000-332D3030-2D30303030 1FE6F8C0
HOBLIST address in DXE = 0x1F461018
Memory Allocation 0x00000004 0x1FDF0000 - 0x1FE0FFFF
Memory Allocation 0x00000004 0x1F9EB000 - 0x1FDEFFFF
Memory Allocation 0x00000003 0x1FE40000 - 0x1FE7FFFF
FV Hob 0x1FE80000 - 0x1FFAFFFF
InstallProtocolInterface: 1F9EAB08-46312000-342D3830-2D30303030 1FE69070
InstallProtocolInterface: 00000001-30300200-332D3130-2D30303230 1F45DA30
InstallProtocolInterface: 00000001-30308FA1-332D3130-2D31414630 1F45E818
If "-Os -mabi=ms" is used for the gcc X64 build, then the pre-r10440
method (using the native stdarg builtins) works. But that is just hiding
the problem.
The __builtin_ms_va_* macros for cross ABI use are not well documented
as far as I can find. File cross-stdarg.h is about it. But they have been
around for a long time, at least since gcc 4.4.
It seems if you make EFI VA_LIST point to __buitin_ms_va__* then you need to decorate any function using VA_LIST with EFIAPI to make sure the code gen, calling, and va_list all match up?

The EFI rules are documented here: http://msdn.microsoft.com/en-us/library/9b372w95.aspx

From debugging problems like this in the past you can usually figure it out from the assembly.
The EFI/VC++ rules are very simple, like passing parameters, and the marker is a pointer to a stack frame looking thing. The Unix version is much more complicated and the marker is sometimes a data structure. The rules in Unix for floating point are very complex so you tend to see more code overhead in the Unix flow.

In the following example I compiled it 1st for Unix and then for EFI calling convention. Note the added complexity in the p() function assembly introduced by the more complex Unix rules. Also note that Unix passes 6 values in registers, and EFI/VC++ is just 4 registers, and the order of the registers are different.

When we were adding the x86_64-pc-win32-macho target to clang we found a few places where the compiler emitted the wrong from of the var args. So we tracked it down by looking at the assembly, then we built a simple stand alone case to file a bug against the compiler.

~/work/Compiler>cat va.c

#include <stdarg.h>

int printf (const char *, ...);

void
p2 (int a, __builtin_va_list *valist)
{
int i;

for (i=0; i <a; i++) {
printf ("%d\n", __builtin_va_arg (*valist, int));
}
}


void
p (int a, ...)
{
__builtin_va_list valist;

__builtin_va_start (valist, a);
p2 (a, &valist);
__builtin_va_end (valist);
}

int
main ()
{
p (7 , 1, 2, 3, 4, 5, 6, 7);
return 0;
}


~/work/Compiler>clang -S -Os va.c
~/work/Compiler>cat va.S
.section __TEXT,__text,regular,pure_instructions
.globl _p2
_p2: ## @p2
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %rbx
pushq %rax
Ltmp6:
.cfi_offset %rbx, -40
Ltmp7:
.cfi_offset %r14, -32
Ltmp8:
.cfi_offset %r15, -24
movq %rsi, %r14
movl %edi, %ebx
testl %ebx, %ebx
jle LBB0_6
## BB#1: ## %.lr.ph
leaq L_.str(%rip), %r15
LBB0_2: ## =>This Inner Loop Header: Depth=1
movslq (%r14), %rcx
cmpq $40, %rcx
ja LBB0_4
## BB#3: ## in Loop: Header=BB0_2 Depth=1
movq %rcx, %rax
addq 16(%r14), %rax
leal 8(%rcx), %ecx
movl %ecx, (%r14)
jmp LBB0_5
LBB0_4: ## in Loop: Header=BB0_2 Depth=1
movq 8(%r14), %rax
leaq 8(%rax), %rcx
movq %rcx, 8(%r14)
LBB0_5: ## in Loop: Header=BB0_2 Depth=1
movl (%rax), %esi
xorl %eax, %eax
movq %r15, %rdi
callq _printf
decl %ebx
jne LBB0_2
LBB0_6: ## %._crit_edge
addq $8, %rsp
popq %rbx
popq %r14
popq %r15
popq %rbp
retq
.cfi_endproc

.globl _p
_p: ## @p
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp12:
.cfi_def_cfa_offset 16
Ltmp13:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp14:
.cfi_def_cfa_register %rbp
pushq %rbx
subq $216, %rsp
Ltmp15:
.cfi_offset %rbx, -24
testb %al, %al
je LBB1_2
## BB#1:
movaps %xmm0, -176(%rbp)
movaps %xmm1, -160(%rbp)
movaps %xmm2, -144(%rbp)
movaps %xmm3, -128(%rbp)
movaps %xmm4, -112(%rbp)
movaps %xmm5, -96(%rbp)
movaps %xmm6, -80(%rbp)
movaps %xmm7, -64(%rbp)
LBB1_2:
movq %r9, -184(%rbp)
movq %r8, -192(%rbp)
movq %rcx, -200(%rbp)
movq %rdx, -208(%rbp)
movq %rsi, -216(%rbp)
movq ***@GOTPCREL(%rip), %rbx
movq (%rbx), %rax
movq %rax, -16(%rbp)
leaq -224(%rbp), %rax
movq %rax, -32(%rbp)
leaq 16(%rbp), %rax
movq %rax, -40(%rbp)
movl $48, -44(%rbp)
movl $8, -48(%rbp)
leaq -48(%rbp), %rsi
callq _p2
movq (%rbx), %rax
cmpq -16(%rbp), %rax
jne LBB1_4
## BB#3:
addq $216, %rsp
popq %rbx
popq %rbp
retq
LBB1_4:
callq ___stack_chk_fail
.cfi_endproc

.globl _main
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp18:
.cfi_def_cfa_offset 16
Ltmp19:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp20:
.cfi_def_cfa_register %rbp
subq $16, %rsp
movl $2, %edx
movl $3, %ecx
xorl %eax, %eax
movl $7, 8(%rsp)
movl $6, (%rsp)
movl $7, %edi
movl $1, %esi
movl $4, %r8d
movl $5, %r9d
callq _p
xorl %eax, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc

.section __TEXT,__cstring,cstring_literals
L_.str: ## @.str
.asciz "%d\n"

.subsections_via_symbols
~/work/Compiler>clang -S -Os va.c -target x86_64-pc-win32-macho
~/work/Compiler>cat va.S
.section __TEXT,__text,regular,pure_instructions
.globl _p2
_p2: ## @p2
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq %rsi
pushq %rdi
pushq %rbx
subq $40, %rsp
Ltmp6:
.cfi_offset %rbx, -40
Ltmp7:
.cfi_offset %rdi, -32
Ltmp8:
.cfi_offset %rsi, -24
movq %rdx, %rsi
movl %ecx, %edi
testl %edi, %edi
jle LBB0_3
## BB#1: ## %.lr.ph
leaq L_.str(%rip), %rbx
LBB0_2: ## =>This Inner Loop Header: Depth=1
movq (%rsi), %rax
leaq 8(%rax), %rcx
movq %rcx, (%rsi)
movl (%rax), %edx
movq %rbx, %rcx
callq _printf
decl %edi
jne LBB0_2
LBB0_3: ## %._crit_edge
addq $40, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %rbp
retq
.cfi_endproc

.globl _p
_p: ## @p
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp12:
.cfi_def_cfa_offset 16
Ltmp13:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp14:
.cfi_def_cfa_register %rbp
pushq %rsi
pushq %rdi
subq $64, %rsp
Ltmp15:
.cfi_offset %rdi, -32
Ltmp16:
.cfi_offset %rsi, -24
movl %ecx, %esi
movq %r9, 40(%rbp)
movq %r8, 32(%rbp)
movq %rdx, 24(%rbp)
leaq 24(%rbp), %rax
movq %rax, -48(%rbp)
testl %esi, %esi
jle LBB1_3
## BB#1: ## %.lr.ph.i
leaq L_.str(%rip), %rdi
LBB1_2: ## =>This Inner Loop Header: Depth=1
movq -48(%rbp), %rax
leaq 8(%rax), %rcx
movq %rcx, -48(%rbp)
movl (%rax), %edx
movq %rdi, %rcx
callq _printf
decl %esi
jne LBB1_2
LBB1_3: ## %p2.exit
addq $64, %rsp
popq %rdi
popq %rsi
popq %rbp
retq
.cfi_endproc

.globl _main
_main: ## @main
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp19:
.cfi_def_cfa_offset 16
Ltmp20:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp21:
.cfi_def_cfa_register %rbp
subq $64, %rsp
movl $7, 56(%rsp)
movl $6, 48(%rsp)
movl $5, 40(%rsp)
movl $4, 32(%rsp)
movl $7, %ecx
movl $1, %edx
movl $2, %r8d
movl $3, %r9d
callq _p
xorl %eax, %eax
addq $64, %rsp
popq %rbp
retq
.cfi_endproc

.section __TEXT,__cstring,cstring_literals
L_.str: ## @.str
.asciz "%d\n"


.subsections_via_symbols
Post by Scott Duplichan
Thanks,
scott
Laszlo Ersek
2014-11-04 13:48:04 UTC
Permalink
Post by Scott Duplichan
]> Optimization is not enabled for x64 builds using gcc 4.4-4.9. For IA32
]> builds, -Os (optimize for small code size) is used. Why is this? Apparently
]> it is because variable argument list handling fails when gcc X64 optimization
]> http://sourceforge.net/p/edk2/mailman/message/25121111/
]
]My reading of r10440 is different. As far as I understand,
]
] (gcc-4.4, X64, stdarg builtins)
]
]is simply a broken a combination, regardless of optimization.
You are right about gcc X64 builds using the standard (native)
stdarg builtins. Without the original r10440 patch, a test using
Duet crashes early on. The exception handler dump has bogus values,
probably due to the same stdarg problem.
My point was why can't -Os be used for the current gcc X64 build like
it is for the IA32 build? Maybe r10440 is not relevant enough to even
be mentioned. What I found is adding -Os to the X64 Duet project
causes the 'g' (GUID) format to malfunction. There may be other
WELCOME TO EFI WORLD!
InstallProtocolInterface: D2B2B828-0826-48A7-B3DF-983C006024F0 1FDF9D58
HOBLIST address in DXE = 0x1F3DA018
Memory Allocation 0x00000004 0x1FD69000 - 0x1FD88FFF
Memory Allocation 0x00000004 0x1F964000 - 0x1FD68FFF
Memory Allocation 0x00000003 0x1FDB9000 - 0x1FE0FFFF
FV Hob 0x1FE10000 - 0x1FFAFFFF
InstallProtocolInterface: D8117CFE-94A6-11D4-9A3A-0090273FC14D 1FDF2AC0
InstallProtocolInterface: 8F644FA9-E850-4DB1-9CE2-0B44698E8DA4 1F3D6A30
InstallProtocolInterface: 09576E91-6D3F-11D2-8E39-00A0C969723B 1F3D7818
WELCOME TO EFI WORLD!
InstallProtocolInterface: 00000000-30300000-332D3030-2D30303030 1FE6F8C0
HOBLIST address in DXE = 0x1F461018
Memory Allocation 0x00000004 0x1FDF0000 - 0x1FE0FFFF
Memory Allocation 0x00000004 0x1F9EB000 - 0x1FDEFFFF
Memory Allocation 0x00000003 0x1FE40000 - 0x1FE7FFFF
FV Hob 0x1FE80000 - 0x1FFAFFFF
InstallProtocolInterface: 1F9EAB08-46312000-342D3830-2D30303030 1FE69070
InstallProtocolInterface: 00000001-30300200-332D3130-2D30303230 1F45DA30
InstallProtocolInterface: 00000001-30308FA1-332D3130-2D31414630 1F45E818
Indeed I can reproduce this problem with OVMF (gcc-4.8, X64 target).

Here's what I did.

First, I tried to figure out what optimization or target-specific
setting incurred by -Os triggers the behavior. I used

gcc -Q -O<X> --help=(target|optimizers|...)

and compared the results between X=0 and X=s (see the gcc manual for
what the above command does). I then tried to reproduce the bug by
manually specifying the -fZZZ flags reported by the above command.
Unfortunately, that doesn't reproduce the problem -- "-Os" does
something that is unavailable with *any other* command line flags. (In
other words, it's not possible to reproduce the behavior of -Os just by
replacing -Os with a bunch of -fZZZ and -mYYY flags.)

Second, I located the exact failure site. It is in
BasePrintLibSPrintMarker() -- the main formatter function --, in file
"MdePkg/Library/BasePrintLib/PrintLibInternal.c". The code in question
handles the 'g' case (GUID printing).

(I didn't use '-Os' globally; I only added

[BuildOptions]
GCC:DEBUG_GCC48_X64_CC_FLAGS = -Os

to "MdePkg/Library/BasePrintLib/BasePrintLib.inf", and that's sufficient
to trigger the problem.)

This of course raises the question why other conversion specifiers work
okay in the same function.

Note that BasePrintLibSPrintMarker() is recursive, for at least the
conversion specifiers 't' (TIME*), 'g' (GUID*), 'r' (RETURN_STATUS):

BasePrintLibSPrint()
BasePrintLibSPrintMarker()
// called for 't', 'g', 'r':
BasePrintLibSPrint()
BasePrintLibSPrintMarker()

However, %r works just fine. (I didn't test %t.) This implies that the
recursion per se is not a problem.

Next, I *trimmed* the format string (and correspondingly, the argument
list) of the following call (invoked under 'g'):

BasePrintLibSPrint (
ValueBuffer,
MAXIMUM_VALUE_CHARACTERS,
0,
"%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
GuidData1,
GuidData2,
GuidData3,
TmpGuid->Data4[0],
TmpGuid->Data4[1],
TmpGuid->Data4[2],
TmpGuid->Data4[3],
TmpGuid->Data4[4],
TmpGuid->Data4[5],
TmpGuid->Data4[6],
TmpGuid->Data4[7]
);

Surprise, when I cut off the arguments starting with GuidData3 (ie. the
last argument that I preserve is GuidData2), then things just work, even
with -Os! But, if I pass GuidData3 too, then the output falls apart.


I'm attaching two files (two disassemblies produced with "objdump -S
.../PrintLibInternal.obj"). I can't exactly figure out what's wrong, but
it looks quite like a gcc bug to me.

Namely, as long as the last argument passed via the ellipsis (...) is
GuidData2 (six arguments in total), then the marker itself (ie. the
VaListMarker parameter of the BasePrintLibSPrintMarker() function, of
type VA_LIST (== CHAR8*)) is kept on the *stack*. And, the macro
application under 'g':

TmpGuid = VA_ARG (VaListMarker, GUID *);

(
#define VA_ARG(Marker, TYPE) (*(TYPE *) ((Marker += _INT_SIZE_OF
(TYPE)) - _INT_SIZE_OF (TYPE)))
)

gets translated to (see disas.good):

- 6b2: 48 8b 44 24 40 mov 0x40(%rsp),%rax
- 6b7: 4c 8b 28 mov (%rax),%r13
- 6ba: 48 83 c0 08 add $0x8,%rax
- 6be: 48 89 44 24 40 mov %rax,0x40(%rsp)

ie.:
* read VaListMarker from the stack into %rax -- %rax now points at the
vararg
* dereference %rax (for fetching the value of the pointer-to-GUID),
* increment the value of %rax (so that now it points to the next vararg)
* write back %rax to the stack (so that VaListMarker actually sees
the increment)

and things work.

However, when GuidData3 is passed in addition (seven or more arguments),
then that triggers two things in gcc's -Os optimizer:

(a) The above VA_ARG() macro invocation gets translated to:

+ 670: 4d 8b 27 mov (%r15),%r12
+ 673: 49 83 c7 08 add $0x8,%r15

which means that VaListMarker is kept in a register only: %r15. See
disas.bad.

(This is consistent with the BasePrintLibSPrintMarker() function's
prologue, as it is *not* EFIAPI, hence the fifth parameter,
VaListMarker, arrives in %r8, and that's copied to %r15 in the prologue:

+ 2: 4d 89 c7 mov %r8,%r15
).

(b) Plus, the BasePrintLibSPrintMarker() --> BasePrintLibSPrint() -->
BasePrintLibSPrintMarker() call chain seems to be inlined / turned into
jumps (instead of calls).


These two in combination make me think that VaListMarker (kept only in
%r15) is clobbered in the course of the recursion.


The following patch works around the issue. It introduces a *volatile*
pointer to the BasePrintLibSPrint() function. Note, it's not the
pointed-to function that is volatile (that would make no sense); the
function pointer itself is volatile.

So, when we call the BasePrintLibSPrint() function through the pointer,
gcc can't avoid *reading* the function pointer, and that enforces a real
call instruction, and the recursive calls are not inlined / implemented
with jumps. (And VaListMarker is again kept on the stack, 0x90(%rsp)).

----------------
diff --git a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
index 8dc5ec7..fbb3726 100644
--- a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
+++ b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
@@ -680,10 +680,20 @@ BasePrintLibSPrintMarker (
if (TmpGuid == NULL) {
ArgumentString = "<null guid>";
} else {
+ UINTN (EFIAPI * volatile PrintFunction) (
+ OUT CHAR8 *StartOfBuffer,
+ IN UINTN BufferSize,
+ IN UINTN Flags,
+ IN CONST CHAR8 *FormatString,
+ ...
+ );
+
+ PrintFunction = BasePrintLibSPrint;
+
GuidData1 = ReadUnaligned32 (&(TmpGuid->Data1));
GuidData2 = ReadUnaligned16 (&(TmpGuid->Data2));
GuidData3 = ReadUnaligned16 (&(TmpGuid->Data3));
- BasePrintLibSPrint (
+ PrintFunction (
ValueBuffer,
MAXIMUM_VALUE_CHARACTERS,
0,
----------------

With this patch, GUIDs are printed okay with -Os.

(Of course it's not edk2 that needs to be fixed.)

Thanks
Laszlo
Post by Scott Duplichan
If "-Os -mabi=ms" is used for the gcc X64 build, then the pre-r10440
method (using the native stdarg builtins) works. But that is just hiding
the problem.
The __builtin_ms_va_* macros for cross ABI use are not well documented
as far as I can find. File cross-stdarg.h is about it. But they have been
around for a long time, at least since gcc 4.4.
Thanks,
scott
]> The patch in this email only adds gcc X64 optimization for gcc versions 4.8
]> and newer.
]
]What happens if you add -Os for gcc-4.8+ (X64) without touching
]NO_BUILTIN_VA_FUNCS and the VA_* macros? Just curious.
See 'g' formatting problem above.
]What implies any connection between lack of -Os and VA_*?
The fact that -Os is used throughout edk2, except for the one
case where VA_* prevents it.
]Thanks!
]Laszlo
]
]> This is because testing with older versions of gcc is a lot of
]> work. On the other hand, the patch could be a lot simpler if it were to
]> ignore gcc version. The patch is boot tested using Duet with gcc 4.8.2 and
]> gcc 4.9.1. For these two cases, the print formatting problem is resolved
]> by the patch.
]>
]> 1) Restrict the change to recent gcc versions where testing is easy
]> (approach of included patch)
]> 2) Apply the change to all gcc versions, and let older versions go
]> untested?
]> 3) Try to find/build the needed older gcc versions so that the patch
]> can apply to all versions and be tested too
]>
]> Thanks,
]> Scott
------------------------------------------------------------------------------
_______________________________________________
edk2-devel mailing list
https://lists.sourceforge.net/lists/listinfo/edk2-devel
Andrew Fish
2014-11-04 17:28:32 UTC
Permalink
Post by Laszlo Ersek
diff --git a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
index 8dc5ec7..fbb3726 100644
--- a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
+++ b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
@@ -680,10 +680,20 @@ BasePrintLibSPrintMarker (
if (TmpGuid == NULL) {
ArgumentString = "<null guid>";
} else {
+ UINTN (EFIAPI * volatile PrintFunction) (
+ OUT CHAR8 *StartOfBuffer,
+ IN UINTN BufferSize,
+ IN UINTN Flags,
+ IN CONST CHAR8 *FormatString,
+ ...
+ );
+
+ PrintFunction = BasePrintLibSPrint;
+
GuidData1 = ReadUnaligned32 (&(TmpGuid->Data1));
GuidData2 = ReadUnaligned16 (&(TmpGuid->Data2));
GuidData3 = ReadUnaligned16 (&(TmpGuid->Data3));
- BasePrintLibSPrint (
+ PrintFunction (
ValueBuffer,
MAXIMUM_VALUE_CHARACTERS,
0,
----------------
With this patch, GUIDs are printed okay with -Os.
(Of course it's not edk2 that needs to be fixed.)
But pedantically you need change the definition of BasePrintLibSPrint() to include EFIAPI.

If you look at BasePrintLibSPrintMarker() (and some of the other routines) you will notice a BASE_LIST and a VA_LIST. We had an API that exposed a VA_LIST (well code that was casting to VA_LIST) in the report status code stack and it forced use to use our own made up BASE_LIST concept to get it to work. I think you are going to hit similar issues mixing calling conventions.

So my 1st question is why do you need to mix calling conventions, and depend on EFIAPI for interoperability. Why not just change the ABI on all functions?

Problems with the mixed calling convention:
1) All assembly routines must be marked as EFIAPI, or the C code will generate the wrong calling convention. Not an issue in the MdePkg, but potentially an issue in other packages.
2) The var arg chain needs to be constant. I think for i386 you get lucky and the calling conventions are close enough it kind of works, but for X64 they are very different. Even if you force VA_LIST to be the Microsoft one, it is not clear to me that forces the compiler to treat every … the Microsoft way.

Thanks,

Andrew Fish
Laszlo Ersek
2014-11-04 19:31:01 UTC
Permalink
Post by Andrew Fish
Post by Laszlo Ersek
diff --git a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
index 8dc5ec7..fbb3726 100644
--- a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
+++ b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
@@ -680,10 +680,20 @@ BasePrintLibSPrintMarker (
if (TmpGuid == NULL) {
ArgumentString = "<null guid>";
} else {
+ UINTN (EFIAPI * volatile PrintFunction) (
+ OUT CHAR8 *StartOfBuffer,
+ IN UINTN BufferSize,
+ IN UINTN Flags,
+ IN CONST CHAR8 *FormatString,
+ ...
+ );
+
+ PrintFunction = BasePrintLibSPrint;
+
GuidData1 = ReadUnaligned32 (&(TmpGuid->Data1));
GuidData2 = ReadUnaligned16 (&(TmpGuid->Data2));
GuidData3 = ReadUnaligned16 (&(TmpGuid->Data3));
- BasePrintLibSPrint (
+ PrintFunction (
ValueBuffer,
MAXIMUM_VALUE_CHARACTERS,
0,
----------------
With this patch, GUIDs are printed okay with -Os.
(Of course it's not edk2 that needs to be fixed.)
But pedantically you need change the definition of BasePrintLibSPrint() to include EFIAPI.
I tried that (without applying above patch, before posting my message);
it didn't help.

Thanks,
Laszlo
Post by Andrew Fish
If you look at BasePrintLibSPrintMarker() (and some of the other
routines) you will notice a BASE_LIST and a VA_LIST. We had an API that
exposed a VA_LIST (well code that was casting to VA_LIST) in the report
status code stack and it forced use to use our own made up BASE_LIST
concept to get it to work. I think you are going to hit similar issues
mixing calling conventions.
So my 1st question is why do you need to mix calling conventions, and
depend on EFIAPI for interoperability. Why not just change the ABI on
all functions?
1) All assembly routines must be marked as EFIAPI, or the C code will
generate the wrong calling convention. Not an issue in the MdePkg, but
potentially an issue in other packages.
2) The var arg chain needs to be constant. I think for i386 you get
lucky and the calling conventions are close enough it kind of works, but
for X64 they are very different. Even if you force VA_LIST to be the
Microsoft one, it is not clear to me that forces the compiler to treat
every … the Microsoft way.
Thanks,
Andrew Fish
------------------------------------------------------------------------------
_______________________________________________
edk2-devel mailing list
https://lists.sourceforge.net/lists/listinfo/edk2-devel
------------------------------------------------------------------------------
Scott Duplichan
2014-11-04 20:15:36 UTC
Permalink
Andrew Fish [mailto:***@apple.com] wrote:


Sent: Tuesday, November 04, 2014 11:29 AM
To: edk2-***@lists.sourceforge.net
Cc: Paolo Bonzini
Subject: Re: [edk2] Enable optimization for gcc x64 builds





On Nov 4, 2014, at 5:48 AM, Laszlo Ersek <***@redhat.com <mailto:***@redhat.com> > wrote:



diff --git a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
index 8dc5ec7..fbb3726 100644
--- a/MdePkg/Library/BasePrintLib/PrintLibInternal.c
+++ b/MdePkg/Library/BasePrintLib/PrintLibInternal.c
@@ -680,10 +680,20 @@ BasePrintLibSPrintMarker (
if (TmpGuid == NULL) {
ArgumentString = "<null guid>";
} else {
+ UINTN (EFIAPI * volatile PrintFunction) (
+ OUT CHAR8 *StartOfBuffer,
+ IN UINTN BufferSize,
+ IN UINTN Flags,
+ IN CONST CHAR8 *FormatString,
+ ...
+ );
+
+ PrintFunction = BasePrintLibSPrint;
+
GuidData1 = ReadUnaligned32 (&(TmpGuid->Data1));
GuidData2 = ReadUnaligned16 (&(TmpGuid->Data2));
GuidData3 = ReadUnaligned16 (&(TmpGuid->Data3));
- BasePrintLibSPrint (
+ PrintFunction (
ValueBuffer,
MAXIMUM_VALUE_CHARACTERS,
0,
----------------

With this patch, GUIDs are printed okay with -Os.

(Of course it's not edk2 that needs to be fixed.)





But pedantically you need change the definition of BasePrintLibSPrint() to include EFIAPI.



If you look at BasePrintLibSPrintMarker() (and some of the other routines) you will notice a BASE_LIST and a VA_LIST. We had an API that exposed a VA_LIST (well code that was casting to VA_LIST) in the report status code stack and it forced use to use our own made up BASE_LIST concept to get it to work. I think you are going to hit similar issues mixing calling conventions.



So my 1st question is why do you need to mix calling conventions, and depend on EFIAPI for interoperability. Why not just change the ABI on all functions?



If I understand your question "Why not just change the ABI on all functions", you mean use Microsoft ABI throughout the code even when compiled with gcc? The gcc option -mabi=ms makes this easy, and it reduces code size too (8% in one test). Part of that code size reduction is because it removes the requirement to save xmm6-xmm15 when calling msabi code. Gcc doesn't optimize the save/restore of xmm6-xmm15, it just does them all. The problems with ms abi I can think of are:

1) Linux developers accustomed to stepping through the sysv calling convention would have to adapt to the ms calling convention.

2) -mabi=ms is probably little used and therefore more likely to have bugs. This might require dropping support for older gcc tool chains.

3) According to an email from you in April, ms abi might not support stack trace without debug symbols.



Even if ms abi is never made the default for gcc code, adding an environment variable such as EXTRA_CC_FLAGS would allow its use in custom builds by those who need the code size reduction it brings.



What about switching EDK2 to sysv abi? I assume that would require dropping support for Microsoft compilers.



Thanks,

Scott



Problems with the mixed calling convention:

1) All assembly routines must be marked as EFIAPI, or the C code will generate the wrong calling convention. Not an issue in the MdePkg, but potentially an issue in other packages.

2) The var arg chain needs to be constant. I think for i386 you get lucky and the calling conventions are close enough it kind of works, but for X64 they are very different. Even if you force VA_LIST to be the Microsoft one, it is not clear to me that forces the compiler to treat every 
 the Microsoft way.



Thanks,



Andrew Fish
Andrew Fish
2014-11-04 21:40:03 UTC
Permalink
Post by Andrew Fish
But pedantically you need change the definition of BasePrintLibSPrint() to include EFIAPI.
If you look at BasePrintLibSPrintMarker() (and some of the other routines) you will notice a BASE_LIST and a VA_LIST. We had an API that exposed a VA_LIST (well code that was casting to VA_LIST) in the report status code stack and it forced use to use our own made up BASE_LIST concept to get it to work. I think you are going to hit similar issues mixing calling conventions.
So my 1st question is why do you need to mix calling conventions, and depend on EFIAPI for interoperability. Why not just change the ABI on all functions?
1) Linux developers accustomed to stepping through the sysv calling convention would have to adapt to the ms calling convention.
Well all the public interfaces (EFI service, protocol member functions, etc.) are EFIAPI, so there will be a mix.
Post by Andrew Fish
2) -mabi=ms is probably little used and therefore more likely to have bugs. This might require dropping support for older gcc tool chains.
Looks like mixing has bugs too. It looks like this issue is caused by a mismatch in the vararg definitions between the two worlds. You can’t really mix styles in a given call stack. It almost seems like you want to force one var arg scheme every place possible.
Post by Andrew Fish
3) According to an email from you in April, ms abi might not support stack trace without debug symbols.
This is not the ABI it is VC++ code generation. There is nothing in the ABI about how to unwind a stack frame, it is about how to call code in C.

In my clang examples, in this thread, we have an EFIAPI compatible calling convention with stack unwind. -target x86_64-pc-win32-macho means build an X64 image using EFIAPI, do the standard frame pointer, and we kick out a Mach-O executable for the debugger. We convert the Mach-O to PE/COFF for EFI compatibility. So on clang it as EFIABI, but with stack unwind. We can always unwind a frame without symbols, until we hit code compiled with VC++.
Post by Andrew Fish
Even if ms abi is never made the default for gcc code, adding an environment variable such as EXTRA_CC_FLAGS would allow its use in custom builds by those who need the code size reduction it brings.
What about switching EDK2 to sysv abi? I assume that would require dropping support for Microsoft compilers.
The EFI calling convention is in the spec, so all things EFI would break. Option ROMs on cards, installed Operating system, etc
. The edk2 is an open source project that implements industry standard, not just a chunk of code.

Thanks,

Andrew Fish
Post by Andrew Fish
Thanks,
Scott
Scott Duplichan
2014-11-07 16:16:23 UTC
Permalink
Andrew Fish [mailto:***@apple.com] wrote:


Sent: Tuesday, November 04, 2014 03:40 PM
To: edk2-***@lists.sourceforge.net
Cc: Paolo Bonzini
Subject: Re: [edk2] Enable optimization for gcc x64 builds





On Nov 4, 2014, at 12:15 PM, Scott Duplichan <***@notabs.org <mailto:***@notabs.org> > wrote:



But pedantically you need change the definition of BasePrintLibSPrint() to include EFIAPI.



If you look at BasePrintLibSPrintMarker() (and some of the other routines) you will notice a BASE_LIST and a VA_LIST. We had an API that exposed a VA_LIST (well code that was casting to VA_LIST) in the report status code stack and it forced use to use our own made up BASE_LIST concept to get it to work. I think you are going to hit similar issues mixing calling conventions.



So my 1st question is why do you need to mix calling conventions, and depend on EFIAPI for interoperability. Why not just change the ABI on all functions?



If I understand your question "Why not just change the ABI on all functions", you mean use Microsoft ABI throughout the code even when compiled with gcc? The gcc option -mabi=ms makes this easy, and it reduces code size too (8% in one test). Part of that code size reduction is because it removes the requirement to save xmm6-xmm15 when calling msabi code. Gcc doesn't optimize the save/restore of xmm6-xmm15, it just does them all. The problems with ms abi I can think of are:

1) Linux developers accustomed to stepping through the sysv calling convention would have to adapt to the ms calling convention.



Well all the public interfaces (EFI service, protocol member functions, etc.) are EFIAPI, so there will be a mix.





2) -mabi=ms is probably little used and therefore more likely to have bugs. This might require dropping support for older gcc tool chains.



Looks like mixing has bugs too. It looks like this issue is caused by a mismatch in the vararg definitions between the two worlds. You can’t really mix styles in a given call stack. It almost seems like you want to force one var arg scheme every place possible.





3) According to an email from you in April, ms abi might not support stack trace without debug symbols.





This is not the ABI it is VC++ code generation. There is nothing in the ABI about how to unwind a stack frame, it is about how to call code in C.



In my clang examples, in this thread, we have an EFIAPI compatible calling convention with stack unwind. -target x86_64-pc-win32-macho means build an X64 image using EFIAPI, do the standard frame pointer, and we kick out a Mach-O executable for the debugger. We convert the Mach-O to PE/COFF for EFI compatibility. So on clang it as EFIABI, but with stack unwind. We can always unwind a frame without symbols, until we hit code compiled with VC++.





Even if ms abi is never made the default for gcc code, adding an environment variable such as EXTRA_CC_FLAGS would allow its use in custom builds by those who need the code size reduction it brings.



What about switching EDK2 to sysv abi? I assume that would require dropping support for Microsoft compilers.





The EFI calling convention is in the spec, so all things EFI would break. Option ROMs on cards, installed Operating system, etc
. The edk2 is an open source project that implements industry standard, not just a chunk of code.



Thanks,



Andrew Fish



These are all good answers. I can't come up with a strong argument for the mixed sysv/ms ABI. Maybe the next step is to test -mabi=ms using several gcc versions (I think -mabi=ms was introduced with gcc 4.5). If that works, I could submit a patch and see what happens..

Thanks,

Scott







Thanks,

Scott
Jordan Justen
2014-11-07 18:39:34 UTC
Permalink
Post by Scott Duplichan
These are all good answers. I can't come up with a strong argument for the
mixed sysv/ms ABI. Maybe the next step is to test -mabi=ms using several gcc
versions (I think -mabi=ms was introduced with gcc 4.5). If that works, I could
submit a patch and see what happens..
I mentioned a reason in this thread a few days back. But, we should
look into -mabi=ms for RELEASE builds.

-Jordan

------------------------------------------------------------------------------
Scott Duplichan
2014-11-07 20:29:45 UTC
Permalink
Jordan Justen [mailto:***@intel.com] wrote:

]On 2014-11-07 08:16:23, Scott Duplichan wrote:
]> These are all good answers. I can't come up with a strong argument for the
]> mixed sysv/ms ABI. Maybe the next step is to test -mabi=ms using several gcc
]> versions (I think -mabi=ms was introduced with gcc 4.5). If that works, I could
]> submit a patch and see what happens..
]
]I mentioned a reason in this thread a few days back. But, we should
]look into -mabi=ms for RELEASE builds.
]
]-Jordan

I agree, the approach in your previous email is a good one. Prototyping
asm functions to enforce calling convention is always a good idea. In theory
an IA32 build could be done with a Microsoft compiler with option /Gr
(__fastcall calling convention) and it would work. This would not be possible
if asm function calling convention information were missing. If I make this
patch, I will add the gcc -mabi=ms to the release build.

Now for rants...
1) Why do so many developers never want to test release builds? To me, code
is not clean until both debug and release builds work smoothly.
2) Why is the NOOPT build missing from virtually every DSC file in EDK2?
The EDK NOOPT build is most like what developers call a DEBUG build. It is
the only one setup for source level debugging, at least for Microsoft tool
chains. The Duet DSC files are missing both RELEASE and NOOPT options. I
may submit a patch to allow all 3 builds to every DSC file.

Thanks,
Scott


------------------------------------------------------------------------------
Andrew Fish
2014-11-07 21:56:05 UTC
Permalink
Post by Scott Duplichan
Now for rants...
1) Why do so many developers never want to test release builds? To me, code
is not clean until both debug and release builds work smoothly.
2) Why is the NOOPT build missing from virtually every DSC file in EDK2?
The EDK NOOPT build is most like what developers call a DEBUG build. It is
the only one setup for source level debugging, at least for Microsoft tool
chains. The Duet DSC files are missing both RELEASE and NOOPT options. I
may submit a patch to allow all 3 builds to every DSC file.
Scott,

Historically, if I remember correctly, the NOOPT builds were added to support Nt32Pkg/EmulatorPkg.

“Back in the day” a NOOPT build would not generally fit in a ROM. But given folks could be building an option ROM, shell, OS loader that don’t have size constraints I think you are right in pointing out the NOOPT builds should be in all the supported tool chains.

Thanks,

Andrew Fish
Post by Scott Duplichan
Thanks,
Scott
------------------------------------------------------------------------------
Laszlo Ersek
2014-11-11 09:33:28 UTC
Permalink
Post by Scott Duplichan
]> These are all good answers. I can't come up with a strong argument for the
]> mixed sysv/ms ABI. Maybe the next step is to test -mabi=ms using several gcc
]> versions (I think -mabi=ms was introduced with gcc 4.5). If that works, I could
]> submit a patch and see what happens..
]
]I mentioned a reason in this thread a few days back. But, we should
]look into -mabi=ms for RELEASE builds.
]
]-Jordan
I agree, the approach in your previous email is a good one. Prototyping
asm functions to enforce calling convention is always a good idea. In theory
an IA32 build could be done with a Microsoft compiler with option /Gr
(__fastcall calling convention) and it would work. This would not be possible
if asm function calling convention information were missing. If I make this
patch, I will add the gcc -mabi=ms to the release build.
Now for rants...
1) Why do so many developers never want to test release builds? To me, code
is not clean until both debug and release builds work smoothly.
In my experience, release (== optimized) builds are practically
unsupportable. Even the Linux kernel disables some optimizations that
make the disassembly unreadable. Unless stuff is power and/or
performance critical, I prefer if the code does exactly what I tell it
to do. (Case in point: the -Os bug with recursion + ellipsis. It works
with -O0. Compilers have bugs.)

*All* software is chock full of bugs, and having to figure out what goes
wrong at a customer's site is a question of "when", not "if". They
either won't be able, or willing, to attempt to reproduce the issue with
a debug build, or they will try and the bug might disappear.

Consequently, since I'm not keen on shipping anything but a debug build,
I don't feel like putting many resources into release builds.
Post by Scott Duplichan
2) Why is the NOOPT build missing from virtually every DSC file in EDK2?
I guess in OVMF we never needed it?
Post by Scott Duplichan
The EDK NOOPT build is most like what developers call a DEBUG build. It is
the only one setup for source level debugging, at least for Microsoft tool
chains.
I don't use Microsoft tool chains. And source level debugging with gdb
is hardly possible even on DEBUG; you have to jump through incredible
hoops. NOOPT doesn't solve anything for Linux-based developers & users
of OVMF.
Post by Scott Duplichan
The Duet DSC files are missing both RELEASE and NOOPT options.
It's an emulator platform, isn't it? You probably won't use it in
production.

Thanks
Laszlo
Scott Duplichan
2014-11-11 17:46:20 UTC
Permalink
Laszlo Ersek [mailto:***@redhat.com] wrote:

]On 11/07/14 21:29, Scott Duplichan wrote:
]> Jordan Justen [mailto:***@intel.com] wrote:
]>
]> ]On 2014-11-07 08:16:23, Scott Duplichan wrote:
]> ]> These are all good answers. I can't come up with a strong argument for the
]> ]> mixed sysv/ms ABI. Maybe the next step is to test -mabi=ms using several gcc
]> ]> versions (I think -mabi=ms was introduced with gcc 4.5). If that works, I could
]> ]> submit a patch and see what happens..
]> ]
]> ]I mentioned a reason in this thread a few days back. But, we should
]> ]look into -mabi=ms for RELEASE builds.
]> ]
]> ]-Jordan
]>
]> I agree, the approach in your previous email is a good one. Prototyping
]> asm functions to enforce calling convention is always a good idea. In theory
]> an IA32 build could be done with a Microsoft compiler with option /Gr
]> (__fastcall calling convention) and it would work. This would not be possible
]> if asm function calling convention information were missing. If I make this
]> patch, I will add the gcc -mabi=ms to the release build.
]>
]> Now for rants...
]> 1) Why do so many developers never want to test release builds? To me, code
]> is not clean until both debug and release builds work smoothly.
]
]In my experience, release (== optimized) builds are practically
]unsupportable. Even the Linux kernel disables some optimizations that
]make the disassembly unreadable. Unless stuff is power and/or
]performance critical, I prefer if the code does exactly what I tell it
]to do. (Case in point: the -Os bug with recursion + ellipsis. It works
]with -O0. Compilers have bugs.)
]
]*All* software is chock full of bugs, and having to figure out what goes
]wrong at a customer's site is a question of "when", not "if". They
]either won't be able, or willing, to attempt to reproduce the issue with
]a debug build, or they will try and the bug might disappear.
]
]Consequently, since I'm not keen on shipping anything but a debug build,
]I don't feel like putting many resources into release builds.

Release builds are/were shipped out of necessity, at least in the past.
This was due to a desire to cut board cost by using the smallest possible
flash chip. But times are changing and NOR flash capacity is growing
even faster than code size. So the flash size reduction motivation for
optimizing code is losing importance I guess. In my experience, getting
a release build to work sometimes results in uncovering hidden coding
errors. A bigger reason to use release builds is boot time reduction.
While UEFI will never boot as fast as coreboot, it can narrow the gap
some by minimizing the time spent reading data from the flash chip.
Adding -Os and link time optimization can cut the image size in half.
That saves significant time when the image is read from the flash chip.
]
]> 2) Why is the NOOPT build missing from virtually every DSC file in EDK2?
]
]I guess in OVMF we never needed it?

I got OVMF booting on a real server a few years ago. Adding the NOOPT
build was the first thing I did. That let me step through the source code
and see all local variables. I couldn't have gotten it working as quickly
as I did without source level debugging.

Instead of adding NOOPT to every project, adding it to one or two might
be a better idea. I don't want to see a lot of code change just for the
sake debugger support.

]> The EDK NOOPT build is most like what developers call a DEBUG build. It is
]> the only one setup for source level debugging, at least for Microsoft tool
]> chains.
]
]I don't use Microsoft tool chains. And source level debugging with gdb
]is hardly possible even on DEBUG; you have to jump through incredible
]hoops. NOOPT doesn't solve anything for Linux-based developers & users
]of OVMF.

I understand. But Microsoft tool chains are still supported by the EDK2
project. Dropping support for Microsoft tool chains would solve some
problems. But clearly that isn't going to happen any time soon. It is
surprising to see the strength and weaknesses of different tool chains.
Microsoft perfected link time optimization 10+ years ago. Yet they
didn't even bother with C99 support until recently. GCC had C99 10+
years ago, yet only recently perfected link time optimization.

It is unfortunate there is no nice open source debugger for use with
EDK2 and other embedded projects. Some of the OEM and IBV debuggers
I used were really nice, though at the time they supported only
Microsoft debug symbols.

]> The Duet DSC files are missing both RELEASE and NOOPT options.
]
]It's an emulator platform, isn't it? You probably won't use it in
]production.

Of all EDK2 projects, Duet seems to need the fewest changes for use
on real hardware. You would be surprised to know how many Duet based
systems have shipped from tier one oems.

]Thanks
]Laszlo

------------------------------------------------------------------------------
Comprehensive Server Monitoring with Site24x7.
Monitor 10 servers for $9/Month.
Get alerted through email, SMS, voice calls or mobile push notifications.
Take corrective actions from your mobile device.
http://pubads.g.doubleclick.net/gampad/clk?id=154624111&iu=/4140/ostg.clktrk
_______________________________________________
edk2-devel mailing list
edk2-***@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/edk2-devel
Jordan Justen
2014-11-04 22:32:57 UTC
Permalink
Post by Andrew Fish
So my 1st question is why do you need to mix calling conventions, and depend
on EFIAPI for interoperability. Why not just change the ABI on all
functions?
GCC 4.4 doesn't support the command line option to change everything
over. So, EFIAPI was the only option then.
Post by Andrew Fish
1) All assembly routines must be marked as EFIAPI, or the C code will
generate the wrong calling convention. Not an issue in the MdePkg, but
potentially an issue in other packages.
I don't see this as a problem. I think this is the rules that we have
set up for EDK II. It just so happens that the GCC4X toolchains are
the only ones that use EFIAPI, and thus are the only ones that allow
us to keep our codebase clean with regards to EFIAPI.

For GCC >= 4.5, I actually think we should convert *RELEASE* builds
over to using the ms-abi all the time to generate smaller code. I
think we should leave DEBUG builds as mixed to help clean up EFIAPI
issues.

-Jordan

------------------------------------------------------------------------------
Tim Lewis
2014-11-04 22:37:40 UTC
Permalink
Another note (from the archives): vendors used mixed builds for VS2003/VS2005 on 32-bit in order to use __fastcall for internal function calls and then EFIABI for all the various UEFI calls.

Tim

-----Original Message-----
From: Jordan Justen [mailto:***@gmail.com]
Sent: Tuesday, November 04, 2014 2:33 PM
To: Andrew J. Fish
Cc: Paolo Bonzini; edk2-***@lists.sourceforge.net
Subject: Re: [edk2] Enable optimization for gcc x64 builds
Post by Andrew Fish
So my 1st question is why do you need to mix calling conventions, and
depend on EFIAPI for interoperability. Why not just change the ABI on
all functions?
GCC 4.4 doesn't support the command line option to change everything over. So, EFIAPI was the only option then.
Post by Andrew Fish
1) All assembly routines must be marked as EFIAPI, or the C code will
generate the wrong calling convention. Not an issue in the MdePkg, but
potentially an issue in other packages.
I don't see this as a problem. I think this is the rules that we have set up for EDK II. It just so happens that the GCC4X toolchains are the only ones that use EFIAPI, and thus are the only ones that allow us to keep our codebase clean with regards to EFIAPI.

For GCC >= 4.5, I actually think we should convert *RELEASE* builds over to using the ms-abi all the time to generate smaller code. I think we should leave DEBUG builds as mixed to help clean up EFIAPI issues.

-Jordan
Andrew Fish
2014-11-04 23:02:48 UTC
Permalink
Post by Jordan Justen
Post by Andrew Fish
So my 1st question is why do you need to mix calling conventions, and depend
on EFIAPI for interoperability. Why not just change the ABI on all
functions?
GCC 4.4 doesn't support the command line option to change everything
over. So, EFIAPI was the only option then.
Post by Andrew Fish
1) All assembly routines must be marked as EFIAPI, or the C code will
generate the wrong calling convention. Not an issue in the MdePkg, but
potentially an issue in other packages.
I don't see this as a problem. I think this is the rules that we have
set up for EDK II. It just so happens that the GCC4X toolchains are
the only ones that use EFIAPI, and thus are the only ones that allow
us to keep our codebase clean with regards to EFIAPI.
I agree it is good in keeping the edk2 code clean. I was more concerned about code from 3rd parties.
So sorry this was more about a warning when you are porting code on what to look out for.

I’m really only concerned about how the VA_LIST stuff is going to work? Does it need to shift for native vs. EFIAPI? If so how you pass the VA_LIST around if the code is not all the same ABI?
Post by Jordan Justen
For GCC >= 4.5, I actually think we should convert *RELEASE* builds
over to using the ms-abi all the time to generate smaller code. I
think we should leave DEBUG builds as mixed to help clean up EFIAPI
issues.
You guys should figure out if you can have a ms-abi but add the frame pointers. The compiler is open source ….

Thanks,

Andrew Fish
Post by Jordan Justen
-Jordan
------------------------------------------------------------------------------
Laszlo Ersek
2014-11-05 10:00:20 UTC
Permalink
Post by Andrew Fish
Post by Jordan Justen
Post by Andrew Fish
So my 1st question is why do you need to mix calling conventions, and depend
on EFIAPI for interoperability. Why not just change the ABI on all
functions?
GCC 4.4 doesn't support the command line option to change everything
over. So, EFIAPI was the only option then.
Post by Andrew Fish
1) All assembly routines must be marked as EFIAPI, or the C code will
generate the wrong calling convention. Not an issue in the MdePkg, but
potentially an issue in other packages.
I don't see this as a problem. I think this is the rules that we have
set up for EDK II. It just so happens that the GCC4X toolchains are
the only ones that use EFIAPI, and thus are the only ones that allow
us to keep our codebase clean with regards to EFIAPI.
I agree it is good in keeping the edk2 code clean. I was more concerned about code from 3rd parties.
So sorry this was more about a warning when you are porting code on what to look out for.
I’m really only concerned about how the VA_LIST stuff is going to
work? Does it need to shift for native vs. EFIAPI? If so how you pass
the VA_LIST around if the code is not all the same ABI?
We need to distinguish passing arguments through the ellipsis (...) from
passing VA_LIST (as a normal, named parameter).

For passing arguments through the ellipsis (...), the called function
*must* be EFIAPI (in the current state of the tree). Otherwise
VA_START() won't work in the callee.

(BTW I have no problem with the above "restriction".)

Regarding passing VA_LIST by name (which is identical to passing a
simple CHAR8* by name) -- it already works fine, regardless of EFIAPI
vs. no-EFIAPI.

The problem discussed in this thread is unrelated to EFIAPI. The problem
is (apparently) that gcc's -Os optimization corrupts a local variable in
a chain of recursive calls.
Post by Andrew Fish
Post by Jordan Justen
For GCC >= 4.5, I actually think we should convert *RELEASE* builds
over to using the ms-abi all the time to generate smaller code. I
think we should leave DEBUG builds as mixed to help clean up EFIAPI
issues.
You guys should figure out if you can have a ms-abi but add the frame pointers. The compiler is open source ….
In my experimentation yesterday, one of the first things I tried (on
"gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-16)") was
'-fno-omit-frame-pointer'.

Because, '-Os' implies '-fomit-frame-pointer', and at that point I
thought that maybe '-fomit-frame-pointer', incurred by '-Os', was
causing the issue.

So, I added '-fno-omit-frame-pointer' *after* -Os.

It triggered a "sorry, unimplemented" bug, which was very similar to
<https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59927>:

sorry, unimplemented: ms_abi attribute requires
-maccumulate-outgoing-args or subtarget optimization implying it

However, after appending '-maccumulate-outgoing-args' as well, the build
resumed. (To clarify, this meant:

-Os -fno-omit-frame-pointer -maccumulate-outgoing-args

.) Unfortunately, although the tree did build like this, the original
issue persisted. Which I took as proof that the bug was unrelated to
reserving or not reserving %rbp as frame pointer.

I wish I could write a small reproducer for this problem...
Post by Andrew Fish
Thanks,
Andrew Fish
Post by Jordan Justen
-Jordan
------------------------------------------------------------------------------
_______________________________________________
edk2-devel mailing list
https://lists.sourceforge.net/lists/listinfo/edk2-devel
------------------------------------------------------------------------------
Laszlo Ersek
2014-11-05 12:29:11 UTC
Permalink
Post by Laszlo Ersek
I wish I could write a small reproducer for this problem...
I wrote such a reproducer. I'll post it as a separate patch, just for
discussion. If we all agree that the code should work, then I could turn
it into a gcc bug report.

Thanks!
Laszlo


------------------------------------------------------------------------------
Andrew Fish
2014-11-05 16:40:55 UTC
Permalink
Post by Laszlo Ersek
Post by Andrew Fish
Post by Jordan Justen
Post by Andrew Fish
So my 1st question is why do you need to mix calling conventions, and depend
on EFIAPI for interoperability. Why not just change the ABI on all
functions?
GCC 4.4 doesn't support the command line option to change everything
over. So, EFIAPI was the only option then.
Post by Andrew Fish
1) All assembly routines must be marked as EFIAPI, or the C code will
generate the wrong calling convention. Not an issue in the MdePkg, but
potentially an issue in other packages.
I don't see this as a problem. I think this is the rules that we have
set up for EDK II. It just so happens that the GCC4X toolchains are
the only ones that use EFIAPI, and thus are the only ones that allow
us to keep our codebase clean with regards to EFIAPI.
I agree it is good in keeping the edk2 code clean. I was more concerned about code from 3rd parties.
So sorry this was more about a warning when you are porting code on what to look out for.
I’m really only concerned about how the VA_LIST stuff is going to
work? Does it need to shift for native vs. EFIAPI? If so how you pass
the VA_LIST around if the code is not all the same ABI?
We need to distinguish passing arguments through the ellipsis (...) from
passing VA_LIST (as a normal, named parameter).
For passing arguments through the ellipsis (...), the called function
*must* be EFIAPI (in the current state of the tree). Otherwise
VA_START() won't work in the callee.
(BTW I have no problem with the above "restriction".)
Regarding passing VA_LIST by name (which is identical to passing a
simple CHAR8* by name) -- it already works fine, regardless of EFIAPI
vs. no-EFIAPI.
OK thanks for the info. For some flavors of GCC the __buildin_va_list is a structure. Since the size of the structure is > 8 bytes it is passed via a pointer per the calling conventions. For X64 EFIAPI VA_LIST is a pointer to the frame where the register based arguments have have been spilled.

For clang x86_64 __buildin_va_list is also a structure, so you can’t mix and match.

Thanks,

Andrew Fish

~/work/Compiler>cat vv.c
#include <stdarg.h>
#include <stdio.h>

int
main ()
{
printf ("sizeof __builtin_va_list %lu\n", sizeof (__builtin_va_list));
return 0;
}
~/work/Compiler>clang vv.c
~/work/Compiler>./a.out
sizeof __builtin_va_list 24
~/work/Compiler>clang -arch i386 vv.c
~/work/Compiler>./a.out
sizeof __builtin_va_list 4
~/work/Compiler>
Post by Laszlo Ersek
The problem discussed in this thread is unrelated to EFIAPI. The problem
is (apparently) that gcc's -Os optimization corrupts a local variable in
a chain of recursive calls.
Post by Andrew Fish
Post by Jordan Justen
For GCC >= 4.5, I actually think we should convert *RELEASE* builds
over to using the ms-abi all the time to generate smaller code. I
think we should leave DEBUG builds as mixed to help clean up EFIAPI
issues.
You guys should figure out if you can have a ms-abi but add the frame pointers. The compiler is open source 
.
In my experimentation yesterday, one of the first things I tried (on
"gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-16)") was
'-fno-omit-frame-pointer'.
Because, '-Os' implies '-fomit-frame-pointer', and at that point I
thought that maybe '-fomit-frame-pointer', incurred by '-Os', was
causing the issue.
So, I added '-fno-omit-frame-pointer' *after* -Os.
It triggered a "sorry, unimplemented" bug, which was very similar to
sorry, unimplemented: ms_abi attribute requires
-maccumulate-outgoing-args or subtarget optimization implying it
However, after appending '-maccumulate-outgoing-args' as well, the build
-Os -fno-omit-frame-pointer -maccumulate-outgoing-args
.) Unfortunately, although the tree did build like this, the original
issue persisted. Which I took as proof that the bug was unrelated to
reserving or not reserving %rbp as frame pointer.
I wish I could write a small reproducer for this problem...
David Woodhouse
2015-07-23 09:46:15 UTC
Permalink
Post by Jordan Justen
Post by Andrew Fish
So my 1st question is why do you need to mix calling conventions, and depend
on EFIAPI for interoperability. Why not just change the ABI on all
functions?
GCC 4.4 doesn't support the command line option to change everything
over. So, EFIAPI was the only option then.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=39472#c8 suggests that the
support was backported to GCC 4.4 too.
Post by Jordan Justen
Post by Andrew Fish
1) All assembly routines must be marked as EFIAPI, or the C code will
generate the wrong calling convention. Not an issue in the MdePkg, but
potentially an issue in other packages.
I don't see this as a problem. I think this is the rules that we have
set up for EDK II. It just so happens that the GCC4X toolchains are
the only ones that use EFIAPI, and thus are the only ones that allow
us to keep our codebase clean with regards to EFIAPI.
For GCC >= 4.5, I actually think we should convert *RELEASE* builds
over to using the ms-abi all the time to generate smaller code. I
think we should leave DEBUG builds as mixed to help clean up EFIAPI
issues.
Is it feasible to just kill EFIAPI completely, if GCC4X toolchains are
the only ones that use it and we can move them to -mabi=ms?

Tim pointed out that vendors use it on 32-bit targets to use __fastcall
for internal functions. If that's a worthwhile optimisation it might
make sense to merge that properly — perhaps using an annotation for the
*internal* functions where it makes most sense, rather than having to
annotate the EFIAPI functions? But quite frankly, if they're not
contributing optimisations upstream in a timely fashion then we
*really* shouldn't be pandering to them. If that's the only remaining
use case then we should let it die.

If we *can't* kill EFIAPI completely, then we need to get GCC's
__builtin_va_list to do the right thing according to the ABI of the
function it happens to be compiling at the time. This is
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=50818
--
David Woodhouse Open Source Technology Centre
***@intel.com Intel Corporation
Paolo Bonzini
2015-07-23 10:04:06 UTC
Permalink
Post by David Woodhouse
Post by Jordan Justen
Post by Andrew Fish
So my 1st question is why do you need to mix calling conventions, and depend
on EFIAPI for interoperability. Why not just change the ABI on all
functions?
GCC 4.4 doesn't support the command line option to change everything
over. So, EFIAPI was the only option then.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=39472#c8 suggests that the
support was backported to GCC 4.4 too.
"ix86/gcc-4_4-branch" sounds like an internal branch for use by Intel
engineers. Features are not backported to stable branches.
Post by David Woodhouse
Post by Jordan Justen
For GCC >= 4.5, I actually think we should convert *RELEASE* builds
over to using the ms-abi all the time to generate smaller code. I
think we should leave DEBUG builds as mixed to help clean up EFIAPI
issues.
Is it feasible to just kill EFIAPI completely, if GCC4X toolchains are
the only ones that use it and we can move them to -mabi=ms?
Tim pointed out that vendors use it on 32-bit targets to use __fastcall
for internal functions. If that's a worthwhile optimisation it might
make sense to merge that properly — perhaps using an annotation for the
*internal* functions where it makes most sense, rather than having to
annotate the EFIAPI functions? But quite frankly, if they're not
contributing optimisations upstream in a timely fashion then we
*really* shouldn't be pandering to them. If that's the only remaining
use case then we should let it die.
If we *can't* kill EFIAPI completely, then we need to get GCC's
__builtin_va_list to do the right thing according to the ABI of the
function it happens to be compiling at the time. This is
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=50818
Am I CCed because you'd like me to fix it? :) I can take a look.

Paolo

------------------------------------------------------------------------------
David Woodhouse
2015-07-23 11:53:49 UTC
Permalink
Post by Paolo Bonzini
Post by David Woodhouse
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=39472#c8 suggests that the
support was backported to GCC 4.4 too.
"ix86/gcc-4_4-branch" sounds like an internal branch for use by Intel
engineers. Features are not backported to stable branches.
Hm, yes. I was misled by that final comment. It doesn't seem to be in
gcc-4_4-branch; you're right.

I don't suppose we can ditch GCC 4.4 support too?

I hesitate slightly because *last* time I said 'here's a nickel, kid.
Get yourself a better compiler' I then ended up spending a month or so
hacking LLVM to add .code16 support... :)
Post by Paolo Bonzini
Post by David Woodhouse
If we *can't* kill EFIAPI completely, then we need to get GCC's
__builtin_va_list to do the right thing according to the ABI of the
function it happens to be compiling at the time. This is
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=50818
Am I CCed because you'd like me to fix it? :) I can take a look.
You were Cc'd because I just revived an old thread and you were already
on it. But don't let me discourage you!

My *primary* motivation right now is getting our OpenSSL patches
upstream though, and fixing PR50818 doesn't really help with that in
the short term. But it *would* be nice.
--
dwmw2
Bruce Cran
2015-07-23 16:29:56 UTC
Permalink
Going back to the original description
(http://comments.gmane.org/gmane.comp.bios.tianocore.devel/10741), I
noticed in
https://firmware.intel.com/sites/default/files/MinnowBoard_MAX-Rel_0.81-ReleaseNotes.txt
that Intel say:

2. Because the binary size created using GCC (Linux environment) is ~20% lager than the size of
the binary created using the Microsoft toolchain (Windows Environment), the firmware build
in the Linux environment (GCC build) uses the minimum shell instead of fullshell, The image
built in the Linux environment (GCC build) may have some limitation for UEFI shell application.


I guess that will be because there's no optimization! If this patch gets committed, could it be backported to the UDK2014.SP1 branch so the MinnowboardMAX project can make use of it, or would it be too risky?

By the way, messages are still going through to the edk2-***@lists.sourceforge.net list. Shouldn't it be set to read-only now the 01.org ML is active?
--
Bruce


------------------------------------------------------------------------------
Continue reading on narkive:
Loading...