Explain Inline Function in C (GCC compiler)
在 C++ 剛出現的時候,大多數 C++ compiler 引入了一個新關鍵字 'inline' ,用於解決個體行為 (methods of object) 的部份效率瓶頸。直到 ISO C99 時,才正式將 inline 定於規範之中。不過多數的 C/C++ compiler 只將 inline 用於 C++ program 中,並未延伸到 C program (*1) ('inline' is a keyword of ANSI/ISO C99. Some C/C++ compilers do not allow its use in C programs. '__inline__' is a keyword of GNU GCC. Some compilers use '__inline'.)。 GCC 最晚在 2.9 版,為 C program 加入了一個擴充關鍵字 '__inline__' (*2),使得 programmer 可以在 C program 中使用 inline function 。請參考 GNU GCC Info document section 'An Inline Function is As Fast As a Macro'。說明文件言簡意賅,我將配合程式碼說明 inline function 在 C program 中運作的相關細節。
- GCC 為 C program 提供了 __inline__ 擴充關鍵字,有些 compiler 則是用 __inline。
- 印象中,我在 1998-99 年時,便已在 C program 中使用了 __inline__ 關鍵字,見 FBTIP。因此 GCC 是在 ISO C99 規範出現前,先提供了這個擴充關鍵字供 programmer 使用。
Codes of Case
c_inline_case1.h
Case 1, combination of inline and static.
/* Case 1: C_INLINE becomes 'static __inline__' unless the includer already defined it. */
#ifndef C_INLINE
#define C_INLINE static __inline__
#endif /* C_INLINE */
c_inline_case2.h
Case 2, combination of inline and extern.
/*
 * Case 2: 'extern' combined with inline, GNU89 meaning - the definition is
 * used only for inlining and no out-of-line copy is ever emitted.
 *
 * When the compiler applies the ISO C99 inline rules it predefines
 * __GNUC_STDC_INLINE__; under those rules 'extern inline' has the opposite
 * meaning, so the gnu_inline attribute is added to request the GNU89
 * behaviour explicitly.  Compilers that do not predefine the macro get the
 * original expansion unchanged.
 */
#if !defined( C_INLINE )
#if defined( __GNUC_STDC_INLINE__ )
#define C_INLINE extern __inline__ __attribute__((__gnu_inline__))
#else
#define C_INLINE extern __inline__
#endif
#endif /* !defined( C_INLINE ) */
c_inline_case3.h
Case 3, inline only.
/*
 * Case 3: inline alone, GNU89 meaning - the function is inlined where
 * possible AND always compiled on its own as a global symbol.
 *
 * When the compiler applies the ISO C99 inline rules it predefines
 * __GNUC_STDC_INLINE__; under those rules a plain inline definition does not
 * produce the out-of-line global copy, so the gnu_inline attribute is added
 * to request the GNU89 behaviour explicitly.  Compilers that do not
 * predefine the macro get the original expansion unchanged.
 */
#if !defined( C_INLINE )
#if defined( __GNUC_STDC_INLINE__ )
#define C_INLINE __inline__ __attribute__((__gnu_inline__))
#else
#define C_INLINE __inline__
#endif
#endif /* !defined( C_INLINE ) */
strcase.h
#if !defined( __STRCASE_H )
#define __STRCASE_H

/*
 * strcase.h - in-place ASCII case conversion helpers.
 *
 * C_INLINE selects the linkage / inline flavour of both functions.  It is
 * normally supplied by a header included before this one; when it has not
 * been defined, fall back to 'static __inline__' so that this header also
 * compiles on its own.
 */
#if !defined( C_INLINE )
#define C_INLINE static __inline__
#endif /* !defined( C_INLINE ) */

/*
 * lcase - convert every character in 'A'..'Z' of the NUL-terminated string
 * s to lower case, in place.  All other characters are left untouched.
 * Returns s itself; a NULL s is returned as is.
 */
C_INLINE char*lcase(char*s) {
    char*c;
    if(s) {
        c=s;
        while(*c) {
            if( *c >= 'A' && *c <= 'Z')
                *c += 'a' - 'A';    /* constant 0x20 */
            ++c;
        }
    }
    return s;
}

/*
 * ucase - convert every character in 'a'..'z' of the NUL-terminated string
 * s to upper case, in place.  All other characters are left untouched.
 * Returns s itself; a NULL s is returned as is.
 */
C_INLINE char*ucase(char*s) {
    char*c;
    if(s) {
        c=s;
        while(*c) {
            if( *c >= 'a' && *c <= 'z')
                *c -= 'a' - 'A';    /* constant 0x20 */
            ++c;
        }
    }
    return s;
}

#endif /* !defined( __STRCASE_H ) */
case_l.c
Call inline function 'lcase' directly. 第 19 行展示了 inline function 可以回傳值的特性。用 MACRO 的方式,在此無法通過。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#if defined(USE_CASE2)
#include "c_inline_case2.h"
#elif defined(USE_CASE3)
#include "c_inline_case3.h"
#else
#include "c_inline_case1.h"
#endif
#include "strcase.h"
/*
 * Entry point of case_l.c.
 * Copies "Hello World\n" into a 1024-byte heap buffer, lower-cases it with a
 * direct call to lcase() and hands lcase()'s return value straight to puts().
 * NOTE(review): the malloc() result is used unchecked and never freed.
 */
int main()
{
    char*text;

    text = (char*)malloc(1024);
    strcpy(text, "Hello World\n");
    puts( lcase(text) );
    return 1;   /* the program's exit status is 1 */
}
case_u.c
Call inline function 'ucase' via pointer of function. 第 17 行展示了 inline function 可以用於函數指標、遞迴的特性。同樣地,用 MACRO 也無法做到。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#if defined(USE_CASE2)
#include "c_inline_case2.h"
#elif defined(USE_CASE3)
#include "c_inline_case3.h"
#else
#include "c_inline_case1.h"
#endif
#include "strcase.h"
/*
 * Entry point of case_u.c.
 * Copies "Hello World\n" into a 1024-byte heap buffer, upper-cases it by
 * calling ucase() through a pointer to function, and prints the result.
 * NOTE(review): the malloc() result is used unchecked and never freed.
 */
int main()
{
    char*text;
    char*(*convert)(char*);

    text = (char*)malloc(1024);
    convert = ucase;            /* taking the address of the inline function */
    strcpy(text, "Hello World\n");
    puts( convert(text) );
    return 1;   /* the program's exit status is 1 */
}
Works only in optimizing compilation
Inlining of functions is an optimization and it really "works" only in optimizing compilation. If you don't use '-O', no function is really inline.
When a function is both inline and static, if all calls to the function are integrated into the caller, and the function's address is never used, then the function's own assembler code is never referenced. In this case, GCC does not actually output assembler code for the function, unless you specify the option `-fkeep-inline-functions'. Some calls cannot be integrated for various reasons (in particular, calls that precede the function's definition cannot be integrated, and neither can recursive calls within the definition). If there is a nonintegrated call, then the function is compiled to assembler code as usual. The function must also be compiled as usual if the program refers to its address, because that can't be inlined.
An Inline Function is As Fast As a Macro
以 '-S' 指示 gcc 將程式碼譯為組合語言碼 (assembler code) ,以便觀察有無啟用最佳化選項 (-O) 的影響。
# gcc -DUSE_CASE1 -S -o case_l_1a.asm case_l.c
# gcc -DUSE_CASE1 -O -S -o case_l_1b.asm case_l.c
# gcc -DUSE_CASE1 -O -S -o case_u_1b.asm case_u.c
case_l_1a.asm
第 39-69 行是 lcase 的 assembler code ,第 32 行顯示此處依然使用 call 調用 lcase ,而非直接展開 (inline) 。這表示在未啟用最佳化選項時, GCC 不會展開 inline function 的內容。
.file "case_l.c"
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC0:
.ascii "Hello World\12\0"
.text
.globl _main
.def _main; .scl 2; .type 32; .endef
_main: # main() of case_l.c compiled without -O: lcase is reached through a real call
pushl %ebp # set up the frame pointer
movl %esp, %ebp
subl $24, %esp # reserve 24 bytes for the local and outgoing arguments
andl $-16, %esp # round %esp down to a multiple of 16
movl $0, %eax
addl $15, %eax
addl $15, %eax
shrl $4, %eax
sall $4, %eax # %eax = ((0 + 15 + 15) >> 4) << 4 = 16
movl %eax, -8(%ebp)
movl -8(%ebp), %eax
call __alloca # toolchain helper called with %eax = 16 (presumably stack setup; its body is not shown here)
call ___main # toolchain startup hook, emitted before the body of main
movl $1024, (%esp) # argument: 1024
call _malloc # malloc(1024)
movl %eax, -4(%ebp) # hello = result, kept at -4(%ebp)
movl $LC0, 4(%esp) # 2nd argument: address of "Hello World\n"
movl -4(%ebp), %eax
movl %eax, (%esp) # 1st argument: hello
call _strcpy # strcpy(hello, "Hello World\n")
movl -4(%ebp), %eax
movl %eax, (%esp) # argument: hello
call _lcase # out-of-line call: lcase was not expanded inline
movl %eax, (%esp) # lcase's return value becomes the argument of puts
call _puts
movl $1, %eax # return 1
leave
ret
.def _lcase; .scl 3; .type 32; .endef
_lcase: # unoptimized lcase(s): s is read from 8(%ebp), the cursor c is kept at -4(%ebp)
pushl %ebp
movl %esp, %ebp
subl $4, %esp # room for the local c
cmpl $0, 8(%ebp) # if (s)
je L3 # s == NULL: go straight to the return
movl 8(%ebp), %eax
movl %eax, -4(%ebp) # c = s
L4: # loop head: while (*c)
movl -4(%ebp), %eax
cmpb $0, (%eax)
je L3 # terminating NUL reached
movl -4(%ebp), %eax
cmpb $64, (%eax) # *c <= 64, i.e. below 'A' (65)?
jle L6 # yes: leave this character alone
movl -4(%ebp), %eax
cmpb $90, (%eax) # *c > 90 ('Z')?
jg L6 # yes: leave this character alone
movl -4(%ebp), %eax
movl -4(%ebp), %edx
movzbl (%edx), %edx # load *c
addb $32, %dl # add 0x20: upper case -> lower case
movb %dl, (%eax) # store the result back to *c
L6:
leal -4(%ebp), %eax
incl (%eax) # ++c
jmp L4
L3:
movl 8(%ebp), %eax # return s
leave
ret
.def _puts; .scl 3; .type 32; .endef
.def _strcpy; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef
case_l_1b.asm
第 21-39 行是 lcase 展開後的內容 (第 17-20 行則是 strcpy 被展開成直接寫入 "Hello World\n" 的常數),這表示 GCC 啟用最佳化選項時,才會展開 inline function 。由於同時搭配 static 關鍵字,在沒有需要時, GCC 不會產生 assembler code ,所以在此看不到 lcase 的 assembler code (In this case, GCC does not actually output assembler code for the function)。
.file "case_l.c"
.def ___main; .scl 2; .type 32; .endef
.text
.globl _main
.def _main; .scl 2; .type 32; .endef
_main: # main() of case_l.c compiled with -O: the body of lcase is expanded in place, no call to it remains
pushl %ebp
movl %esp, %ebp
pushl %ebx # %ebx is used below to hold hello; it is reloaded before returning
subl $4, %esp
andl $-16, %esp # round %esp down to a multiple of 16
movl $16, %eax
call __alloca # toolchain helper called with %eax = 16 (presumably stack setup; its body is not shown here)
call ___main # toolchain startup hook, emitted before the body of main
movl $1024, (%esp) # argument: 1024
call _malloc # hello = malloc(1024), result in %eax
movl $1819043144, (%eax) # 0x6C6C6548 = "Hell": strcpy replaced by direct stores
movl $1867980911, 4(%eax) # 0x6F57206F = "o Wo"
movl $174353522, 8(%eax) # 0x0A646C72 = "rld\n"
movb $0, 12(%eax) # terminating NUL
movl %eax, %ebx # keep hello: it is the value lcase returns
testl %eax, %eax # inlined lcase starts here: if (s)
je L2
movl %eax, %edx # c = s
cmpb $0, (%eax) # first test of while (*c)
je L2
L6:
movzbl (%edx), %ecx # %cl = *c
movb %cl, %al
subb $65, %al # *c - 'A'
cmpb $25, %al
ja L5 # unsigned compare: skip unless 'A' <= *c <= 'Z'
movb %cl, %al
addb $32, %al # upper case -> lower case
movb %al, (%edx) # store the result back to *c
L5:
incl %edx # ++c
cmpb $0, (%edx)
jne L6 # keep looping while (*c)
L2:
movl %ebx, (%esp) # argument: hello
call _puts
movl $1, %eax # return 1
movl -4(%ebp), %ebx # restore the saved %ebx
leave
ret
.def _puts; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef
case_u_1b.asm
儘管啟用了最佳化選項,但此例是透過函數指標 (pointer to function) 調用 ucase ,所以第 28-55 行中, GCC 依然產生了 ucase 的 assembler code (The function must also be compiled as usual if the program refers to its address, because that can't be inlined.)。 GCC 仍然會在可以展開之處展開 inline function ,但在不能展開之處,則用 call 調用。在第 21 行中,便是用 call 調用 ucase 。可以注意到,第 21 行並不是透過原先指定的變數,而是直接取用 ucase 的 address ,這是因為在最佳化情形下, GCC 判斷可以直接取用 ucase 的 address 。
.file "case_u.c"
.def ___main; .scl 2; .type 32; .endef
.text
.globl _main
.def _main; .scl 2; .type 32; .endef
_main: # main() of case_u.c compiled with -O, C_INLINE = static __inline__
pushl %ebp
movl %esp, %ebp
subl $8, %esp
andl $-16, %esp # round %esp down to a multiple of 16
movl $16, %eax
call __alloca # toolchain helper called with %eax = 16 (presumably stack setup; its body is not shown here)
call ___main # toolchain startup hook, emitted before the body of main
movl $1024, (%esp) # argument: 1024
call _malloc # hello = malloc(1024), result in %eax
movl $1819043144, (%eax) # 0x6C6C6548 = "Hell": strcpy replaced by direct stores
movl $1867980911, 4(%eax) # 0x6F57206F = "o Wo"
movl $174353522, 8(%eax) # 0x0A646C72 = "rld\n"
movb $0, 12(%eax) # terminating NUL
movl %eax, (%esp) # argument: hello
call _ucase # direct call to the out-of-line ucase; no pointer variable is used
movl %eax, (%esp) # ucase's return value becomes the argument of puts
call _puts
movl $1, %eax # return 1
leave
ret
.def _ucase; .scl 3; .type 32; .endef
_ucase: # out-of-line body of ucase(s); the C source takes its address, and main calls this copy
pushl %ebp
movl %esp, %ebp
pushl %ebx # %ebx holds s for the whole function; restored before ret
movl 8(%ebp), %ebx # %ebx = s
testl %ebx, %ebx # if (s)
je L3
movl %ebx, %edx # c = s
cmpb $0, (%ebx) # first test of while (*c)
je L3
L7:
movzbl (%edx), %ecx # %cl = *c
movb %cl, %al
subb $97, %al # *c - 'a'
cmpb $25, %al
ja L6 # unsigned compare: skip unless 'a' <= *c <= 'z'
movb %cl, %al
subb $32, %al # lower case -> upper case
movb %al, (%edx) # store the result back to *c
L6:
incl %edx # ++c
cmpb $0, (%edx)
jne L7 # keep looping while (*c)
L3:
movl %ebx, %eax # return s
popl %ebx
popl %ebp
ret
.def _puts; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef
Combination of inline and extern
If you specify both inline and extern in the function definition, then the definition is used only for inlining. In no case is the function compiled on its own, not even if you refer to its address explicitly. Such an address becomes an external reference, as if you had only declared the function, and had not defined it.
This combination of inline and extern has almost the effect of a macro.
An Inline Function is As Fast As a Macro
當 inline 和 extern 搭配使用時, GCC 完全不產生 inline function 的 assembler code ,即使是無法展開 inline function 的場合,也僅僅試圖以外部符號調用外部的 assembler code 。因此在此例中,雖可通過 compile ,但 linker 會丟出參照符號未定義的錯誤。
# gcc -DUSE_CASE2 -O -o case_u_2 case_u.c
ccymaaaa.o(.text+0x40):case_u.c: undefined reference to 'ucase'
gcc: ld returned 1 exit status
# gcc -DUSE_CASE2 -O -S -o case_u_2.asm case_u.c
case_u_2.asm
此例係透過函數指標調用 ucase ,無法直接展開。故在第 28 行中宣告 ucase 是一個外部符號,第 21 行中用 call 調用。完全不產生 ucase 的 assembler code ,而認為其可透過連結其他 object files 獲得。
.file "case_u.c"
.def ___main; .scl 2; .type 32; .endef
.text
.globl _main
.def _main; .scl 2; .type 32; .endef
_main: # main() of case_u.c compiled with -O, C_INLINE = extern __inline__
pushl %ebp
movl %esp, %ebp
subl $8, %esp
andl $-16, %esp # round %esp down to a multiple of 16
movl $16, %eax
call __alloca # toolchain helper called with %eax = 16 (presumably stack setup; its body is not shown here)
call ___main # toolchain startup hook, emitted before the body of main
movl $1024, (%esp) # argument: 1024
call _malloc # hello = malloc(1024), result in %eax
movl $1819043144, (%eax) # 0x6C6C6548 = "Hell": strcpy replaced by direct stores
movl $1867980911, 4(%eax) # 0x6F57206F = "o Wo"
movl $174353522, 8(%eax) # 0x0A646C72 = "rld\n"
movb $0, 12(%eax) # terminating NUL
movl %eax, (%esp) # argument: hello
call _ucase # this listing contains no _ucase body: the symbol is left for the linker to resolve
movl %eax, (%esp) # the call's return value becomes the argument of puts
call _puts
movl $1, %eax # return 1
leave
ret
.def _puts; .scl 3; .type 32; .endef
.def _ucase; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef
Only inline
When an inline function is not static, then the compiler must assume that there may be calls from other source files; since a global symbol can be defined only once in any program, the function must not be defined in the other source files, so the calls therein cannot be integrated. Therefore, a non-static inline function is always compiled on its own in the usual fashion.
An Inline Function is As Fast As a Macro
單獨使用 inline 時, GCC 一定會產生 inline function 的 assembler code ,並宣告為全域符號 (global symbol) 。在此例中,其立即而明顯的問題在於,每一個引入 strcase.h 的 C source files ,其 object file 中都會包含相同的全域符號 'lcase' 和 'ucase' ,在連結多個 object files 時, linker 會丟出符號定義重複的錯誤。
# gcc -DUSE_CASE3 -O -S -o case_l_3.asm case_l.c
case_l_3.asm
第 5-32 行是 lcase 的 assembler code ,第 3 行宣告 'lcase' 為全域符號 (global symbol)。第 35-62 行是 ucase 的 assembler code ,第 33 行宣告 'ucase' 為全域符號 (global symbol)。第 81-99 行處,由於可以展開 inline function ,故展開之 (第 77-80 行則是 strcpy 被展開成直接寫入 "Hello World\n" 的常數)。
.file "case_l.c"
.text
.globl _lcase
.def _lcase; .scl 2; .type 32; .endef
_lcase: # out-of-line body of lcase(s), declared global by the .globl directive just before it
pushl %ebp
movl %esp, %ebp
pushl %ebx # %ebx holds s for the whole function; restored before ret
movl 8(%ebp), %ebx # %ebx = s
testl %ebx, %ebx # if (s)
je L2
movl %ebx, %edx # c = s
cmpb $0, (%ebx) # first test of while (*c)
je L2
L6:
movzbl (%edx), %ecx # %cl = *c
movb %cl, %al
subb $65, %al # *c - 'A'
cmpb $25, %al
ja L5 # unsigned compare: skip unless 'A' <= *c <= 'Z'
movb %cl, %al
addb $32, %al # upper case -> lower case
movb %al, (%edx) # store the result back to *c
L5:
incl %edx # ++c
cmpb $0, (%edx)
jne L6 # keep looping while (*c)
L2:
movl %ebx, %eax # return s
popl %ebx
popl %ebp
ret
.globl _ucase
.def _ucase; .scl 2; .type 32; .endef
_ucase: # out-of-line body of ucase(s), declared global by the .globl directive just before it
pushl %ebp
movl %esp, %ebp
pushl %ebx # %ebx holds s for the whole function; restored before ret
movl 8(%ebp), %ebx # %ebx = s
testl %ebx, %ebx # if (s)
je L9
movl %ebx, %edx # c = s
cmpb $0, (%ebx) # first test of while (*c)
je L9
L13:
movzbl (%edx), %ecx # %cl = *c
movb %cl, %al
subb $97, %al # *c - 'a'
cmpb $25, %al
ja L12 # unsigned compare: skip unless 'a' <= *c <= 'z'
movb %cl, %al
subb $32, %al # lower case -> upper case
movb %al, (%edx) # store the result back to *c
L12:
incl %edx # ++c
cmpb $0, (%edx)
jne L13 # keep looping while (*c)
L9:
movl %ebx, %eax # return s
popl %ebx
popl %ebp
ret
.def ___main; .scl 2; .type 32; .endef
.globl _main
.def _main; .scl 2; .type 32; .endef
_main: # main() of case_l.c compiled with -O, C_INLINE = __inline__: lcase is expanded in place here as well
pushl %ebp
movl %esp, %ebp
pushl %ebx # %ebx is used below to hold hello; it is reloaded before returning
subl $4, %esp
andl $-16, %esp # round %esp down to a multiple of 16
movl $16, %eax
call __alloca # toolchain helper called with %eax = 16 (presumably stack setup; its body is not shown here)
call ___main # toolchain startup hook, emitted before the body of main
movl $1024, (%esp) # argument: 1024
call _malloc # hello = malloc(1024), result in %eax
movl $1819043144, (%eax) # 0x6C6C6548 = "Hell": strcpy replaced by direct stores
movl $1867980911, 4(%eax) # 0x6F57206F = "o Wo"
movl $174353522, 8(%eax) # 0x0A646C72 = "rld\n"
movb $0, 12(%eax) # terminating NUL
movl %eax, %ebx # keep hello: it is the value lcase returns
testl %eax, %eax # inlined lcase starts here: if (s)
je L16
movl %eax, %edx # c = s
cmpb $0, (%eax) # first test of while (*c)
je L16
L20:
movzbl (%edx), %ecx # %cl = *c
movb %cl, %al
subb $65, %al # *c - 'A'
cmpb $25, %al
ja L19 # unsigned compare: skip unless 'A' <= *c <= 'Z'
movb %cl, %al
addb $32, %al # upper case -> lower case
movb %al, (%edx) # store the result back to *c
L19:
incl %edx # ++c
cmpb $0, (%edx)
jne L20 # keep looping while (*c)
L16:
movl %ebx, (%esp) # argument: hello
call _puts
movl $1, %eax # return 1
movl -4(%ebp), %ebx # restore the saved %ebx
leave
ret
.def _puts; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef
樂多舊回應