diff --git a/assets/3_1stackframe.png b/assets/3_1stackframe.png new file mode 100644 index 0000000..470cd91 Binary files /dev/null and b/assets/3_1stackframe.png differ diff --git a/notes/3.md b/notes/3.md index 9d53785..84801ac 100644 --- a/notes/3.md +++ b/notes/3.md @@ -52,7 +52,7 @@ void sumstore(long x, long y, long *dest) { ``` ```sh {cmd hide} -while ![ -r 3_1.o ]; do sleep .1; done; objdump -d 3_1.o +while ! [ -r 3_1.o ]; do sleep .1; done; objdump -d 3_1.o ``` ### Integer Registers @@ -267,7 +267,7 @@ long absdiff(long x, long y) { ``` ```sh { cmd hide } -while ![ -r 3_3.o ]; do sleep .1; done; objdump -d 3_3.o -Msuffix +while ! [ -r 3_3.o ]; do sleep .1; done; objdump -d 3_3.o -Msuffix ``` **expressing with `goto`** @@ -300,7 +300,7 @@ long absdiff(long x, long y) { ``` ```sh {cmd hide} -while ![ -r 3_5.o ]; do sleep .1; done; objdump -d 3_5.o -Msuffix +while ! [ -r 3_5.o ]; do sleep .1; done; objdump -d 3_5.o -Msuffix ``` However, there are several *bad cases* for conditional move. @@ -357,7 +357,7 @@ loop: ```sh {cmd hide} -while ![ -r 3_6.o ]; do sleep .1; done; objdump -d 3_6.o -Msuffix +while ! [ -r 3_6.o ]; do sleep .1; done; objdump -d 3_6.o -Msuffix ``` **general do-while translation** @@ -426,7 +426,7 @@ long pcount_while(unsigned long x) { ``` ```sh {cmd hide} echo "jmp-to-middle translation" -while ![ -r 3_7.o ]; do sleep .1; done; objdump -d 3_7.o -Msuffix +while ! [ -r 3_7.o ]; do sleep .1; done; objdump -d 3_7.o -Msuffix ``` **general while translation#2** @@ -478,7 +478,7 @@ long pcount_while(unsigned long x) { ``` ```sh {cmd hide} echo "while to do-while conversion" -while ![ -r 3_8.o ]; do sleep .1; done; objdump -d 3_8.o -Msuffix +while ! [ -r 3_8.o ]; do sleep .1; done; objdump -d 3_8.o -Msuffix ``` #### for loop form @@ -560,13 +560,13 @@ long pcount_for(unsigned long x) { ```sh {cmd hide} -while ![ -r 3_9.o ]; do sleep .1; done; objdump -d 3_9.o -Msuffix +while ! 
[ -r 3_9.o ]; do sleep .1; done; objdump -d 3_9.o -Msuffix ``` ```sh {cmd hide} -while ![ -r 3_10.o ]; do sleep .1; done; objdump -d 3_10.o -Msuffix +while ! [ -r 3_10.o ]; do sleep .1; done; objdump -d 3_10.o -Msuffix ``` @@ -614,7 +614,7 @@ long switch_eg (long x, long y, long z) { ```sh {cmd hide} -while ![ -r 3_11.s ]; do sleep .1; done; cat 3_11.s +while ! [ -r 3_11.s ]; do sleep .1; done; cat 3_11.s ``` @@ -667,7 +667,7 @@ void multstore(long x, long y, long *dest) { ``` ```sh {cmd hide} -while ![ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix +while ! [ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix ``` Procedure call `call label` @@ -687,7 +687,7 @@ Procedure return: `ret` for example with above example ```sh {cmd hide} -while ![ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix +while ! [ -r 3_12.o ]; do sleep .1; done; objdump -d 3_12.o -Msuffix ``` * with above `mult2` variable `t` is already stored in `%rax` @@ -718,6 +718,38 @@ Deallocated when return, "finish" code and includes pop by `ret`. #### x86-64/Linux Stack Frame +![stack frame image](/assets/3_1stackframe.png) + +* Arguments +* Local variables +* Old `rbp` + +### Register Saving Conventions + +When calling a function, the temporary values held in registers could be overwritten by the called function, which could cause trouble. So there are **conventions** for saving register values. + +When procedure `yoo` calls `who`: `yoo` is the `caller`, `who` is the `callee` +* Caller saves temporary values in its frame before the call. +* Callee saves temporary values in its frame before using the registers and restores them before returning to the caller. 
+ + +#### x86-64 Linux Register Usage + +`%rbx`, `%r12`, `%r13`, `%r14`, `%r15` +* Callee-saved +* Callee must save & restore + +`%rbp` +* Callee-saved +* Callee must save & restore +* May be used as frame pointer by callee +* Can mix & match + +`%rsp` +* Special form of callee-saved +* Restored to original value upon exit from procedure + +#### EX * for compile w/o *stack canary*, add option `-fno-stack-protector` ```c {cmd=gcc args=[-Og -x c -fno-stack-protector -c $input_file -o 3_13.o]} @@ -735,5 +767,27 @@ long call_incr() { ``` ```sh {cmd hide} -while ![ -r 3_13.o ]; do sleep .1; done; objdump -d 3_13.o -Msuffix -``` \ No newline at end of file +while ! [ -r 3_13.o ]; do sleep .1; done; objdump -d 3_13.o -Msuffix +``` + +### Recursive Function + +```c {cmd=gcc args=[-O1 -x c -fno-stack-protector -c $input_file -o 3_14.o]} +long pcount_r(unsigned long x) { + if (x == 0) { + return 0; + } else { + return (x & 1) + pcount_r(x >> 1); + } +} +``` + +```sh {cmd hide} +while ! [ -r 3_14.o ]; do sleep .1; done; objdump -d 3_14.o -Msuffix +``` + +Recursion needs no special handling. +* Stack frames mean that each function call has private storage. +* Register saving conventions prevent one function call from corrupting another's data. *unless the data is explicitly corrupted, e.g. by a buffer overflow* +* Stack discipline follows the call/return pattern (LIFO) + diff --git a/notes/4.md b/notes/4.md index 2992031..05105c2 100644 --- a/notes/4.md +++ b/notes/4.md @@ -1,6 +1,284 @@ -# Machine Level Programming +# Optimization -아키텍쳐(ISA) -* intel(x86): CISC -* ARM(aarch64, aarch32): RISC +There's more to performance than asymptotic complexity (time complexity). + +But not all instructions consume the same amount of time. Constant factors matter too! So we need to understand the system to optimize performance. 
+* How programs are compiled and executed +* How modern processors and memory systems operate +* How to measure performance and identify bottlenecks +* How to improve performance without destroying code modularity and generality + +Compilers provide an efficient mapping of program to machine code +* Register allocation +* Code selection and ordering (scheduling) +* Dead code elimination +* Eliminating minor inefficiencies + +**Don't improve asymptotic efficiency**. + +## Generally Useful Optimizations + +### Code Motion(Hoisting) + +Reduce the frequency with which a computation is performed. If it will always produce the same result, then move it to a place where it is computed once and reused. +This especially means moving code out of loops. + +```c {cmd=gcc args=[-Og -x c -c $input_file -o 4_1.o]} +void set_row(double *a, double *b, long i, long n) { + long j; + for (j = 0; j < n; j++) { + a[i * n + j] = b[j]; + } +} +``` + + + + + + + + +
DefaultOptimized
+ +```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_2.o]} +void set_row(double *a, double *b, long i, long n) { + long j; + for (j = 0; j < n; j++) { + a[i * n + j] = b[j]; + } +} +``` + + +```c +void set_row_opt(double *a, double *b, long i, long n) { + long j; + int ni = n * i; + for (j = 0; j < n; j++) { + a[ni + j] = b[j]; + } +} +``` +
+ +```sh {cmd hide} +while ! [ -r 4_1.o ]; do sleep .1; done; objdump -d 4_1.o +``` +`imul` is located in the loop. + + +```sh {cmd hide} +while ! [ -r 4_2.o ]; do sleep .1; done; objdump -d 4_2.o +``` +can see that `imul` is located out of the loop. +
+ +The two versions above have the same number of instructions. But the optimized version has **fewer executed instructions**. + +GCC will do this with the `-O1` option + +### Reduction in Strength + +Replace a costly operation with a simpler one. + +For example: replace a multiplication by a power of 2 with a shift operation. Normally, multiply and divide are expensive operations. On Intel Nehalem, `imul` requires 3 CPU cycles; on the other hand, `add` requires 1 cycle. + + + + +
DefaultOptimized
+ +```c +void test_reduction(double *a, double *b, long i, long n) { + int i, j; + for (i = 0;i < n; i++) { + int ni = n * i; + for (j = 0; j < n; j++) { + a[ni + j] = b[j]; + } + } +} +``` + + +```c +void test_reduction_opt(double *a, double *b, long i, long n) { + int i, j; + int ni = 0; + for (i = 0;i < n; i++) { + for (j = 0; j < n; j++) { + a[ni + j] = b[j]; + } + ni += n; + } +} +``` +
+ +### Share Common Subexpressions + +Reuse portions of expressions + +GCC will do this with `-O1` + + + + +
DefaultOptimized
+ +```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_3.o]} +double test_scs(double* val, long i, long j, long n) { + double up, down, left, right; + + up = val[(i - 1) * n + j]; + down = val[(i + 1) * n + j]; + left = val[i * n + (j - 1)]; + right = val[i * n + (j + 1)]; + return up + down + left + right; +} +``` + + +```c +double test_scs_opt(double *a, double *b, long i, long n) { + double up, down, left, right; + + long inj = i * n + j; + + up = a[inj - n]; + down = a[inj + n]; + left = b[inj - 1]; + right = b[inj + 1]; + return up + down + left + right; +} +``` +
+ +```sh {cmd hide} +while ! [ -r 4_3.o ]; do sleep .1; done; objdump -d 4_3.o +``` + +The dump above shows only one `imul`, which shows that common-subexpression sharing was applied. + +### Remove Unnecessary Procedure + +Think with your intuition. + +## Optimization Blockers + +Compilers cannot always optimize your code. + +```c +void lower(char *s) { + size_t i; + for (i = 0; i < strlen(s); i++) { + if (s[i] >= 'A' && s[i] <= 'Z') { + s[i] -= ('A' - 'a'); + } + } +} +``` + +The performance of the code above is bad: the time quadruples when the string length doubles. +This is because `strlen` is executed on every loop iteration. `strlen` is $O(n)$, therefore the overall performance of `lower` is $O(n^2)$ + +Therefore we optimize with Code Motion, moving the length calculation out of the loop. + +```c +void lower(char *s) { + size_t i; + size_t len = strlen(s); + for (i = 0; i < len; i++) { + if (s[i] >= 'A' && s[i] <= 'Z') { + s[i] -= ('A' - 'a'); + } + } +} +``` + +### #1 Procedure Calls + +A procedure may have side effects, and a function may not return the same value for given arguments. + +So the compiler treats a procedure call as a black box and applies only weak optimizations near it. Therefore strong optimizations like **Code Motion** are not applied. + +To enable strong optimizations, either use inline functions with the `-O1` option, or **do it yourself**. + +### Memory Aliasing + +```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_4.o]} +void sum_rows(double *a, double *b, long n) { + long i, j; + for (i = 0; i < n; i++) { + b[i] = 0; + for (j = 0; j < n; j++) { + b[i] += a[i * n + j]; + } + } +} +``` +```sh {cmd hide} +while ! [ -r 4_4.o ]; do sleep .1; done; objdump -d 4_4.o -Msuffix +``` + +The compiler updates `b[i]` in memory on every iteration, because it must consider the possibility that these updates affect program behavior. (`b` and `a` may overlap: memory aliasing) + +Memory aliasing means two different memory references specify a single location. +In C, this happens easily, 
because C allows address arithmetic and direct access to storage structures. + +```c {cmd=gcc args=[-O1 -x c -c $input_file -o 4_5.o]} +void sum_rows(double *a, double *b, long n) { + long i, j; + for (i = 0; i < n; i++) { + double val = 0; + for (j = 0; j < n; j++) { + val += a[i * n + j]; + } + b[i] = val; + } +} +``` +```sh {cmd hide} +while ! [ -r 4_5.o ]; do sleep .1; done; objdump -d 4_5.o -Msuffix +``` + +By introducing local variables, we can easily get optimized code. + +## Exploiting Instruction-Level Parallelism(ILP) + +Execute multiple instructions at the same time. This can reduce the average number of cycles per instruction, and it requires a general understanding of modern processor design: HW can execute many operations in parallel. + +* Performance is limited by data dependencies + +Simple transformations can yield dramatic performance improvements. + +### Superscalar Processors + +Issue and execute multiple instructions in one cycle. + +pipelining -> data dependency. + + +For example, Haswell CPU functional units: +* 2 load +* 1 store +* 4 integer +* 2 FP mult +* 1 FP add +* 1 FP div +* 1 int mult + +### Programming with AVX2 + +YMM registers: 256 bits each, 16 registers in total. + +**SIMD Operations** + +for single precision +`vaddps %ymm0, %ymm1, %ymm1`: + +for double precision + +`vaddpd %ymm0, %ymm1, %ymm1`