[Lecture 14] 멀티쓰레딩 VIII - Atomic Variables and Operations

#Introduction

적절한 락(lock)이 사용될 때, 여러 스레드는 공유 주소 공간에 접근할 수 있고 공유 메모리 멀티스레딩 프로그래밍 모델의 기본 직관에 해당하는 이러한 공유 변수의 동일한 값을 볼 수 있다.

데이터 레이스가 없는 프로그램만이 순차적 일관성을 제공한다(모든 스레드에 의해 "단계"의 관점에서 모든 메모리 동작의 순차적 순서가 관찰되며 프로그램의 순서와 일치한다)

이를 보장하는 전통적인 방법은 lock사용, 세마포어나 조건 변수를 사용하는 것이었다
- lock: lock을 획득하고 중요 섹션에 진입하는 두 번째 스레드에는 lock을 획득한 첫 번째 스레드에서 수행한 모든 업데이트가 표시된다
- 세마포어/조건 변수: 대기 호출에서 반환되는 스레드는 대기에서 스레드를 반환하게 된 신호 작업 전에 수행된 모든 업데이트를 확인한다.
데이터 race의 자유
- 스레드 2개가 서로 다른 순서로 업데이트를 본다. 스레드 1이 A를 업데이트한 다음 B를 업데이트하고 스레드 2가 B의 새 값과 A의 이전 값을 확인한다.
- 데이터 race의 자유는 순차적인 일관된 주문만 존재하도록 규정하고 있으며, 어떤 주문인지 알 수 없다.

만약 lock이 너무 느리면?

컴파일러와 프로세서를 제한하는 다른 방법들이 있을까? 원자 변수라는 것이 존재한다.

C11/C++11에서는 원자 변수를 지원한다. C/C++에서 원자 변수는 "동기 변수"이다. 이러한 변수들의 사용은 이러한 변수들에 대한 접근 전후의 메모리 연산에 대해 관찰된 특정 인터리빙을 허용하지 않는다. 또한 스레드를 BLOCKED 상태로 만들지 않는다.

동기화 변수에 대한 동시 액세스는 레이스로 간주되지 않는다
이러한 변수는 읽기-수정-쓰기 작업에서도 원자적으로 업데이트될 수 있다
기본적으로 (메모리 순서) 원자 접근 사이의 비원자 변수에 대한 접근과 접근을 위해 순차적으로 일관된 순서가 존재함을 보장한다

#failure of busy-waiting “done” flag check

//waitingonaflag.c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
bool done;
int x;
void thread1() {
    x = rand() % 2;
    done = true;
}
int thread2() {
    while (!done) { }
    return x;
}

//compiled with gcc 7.5
    thread1:
    subq $8, %rsp
    call rand@PLT
    movl %eax, %edx
    movb $1, done(%rip) # done = true
    shrl $31, %edx
    addl %edx, %eax
    andl $1, %eax
    subl %edx, %eax
    movl %eax, x(%rip) # x = ...
    addq $8, %rsp
    ret
    
thread2:
    cmpb $0, done(%rip)
    jne .L8 # if !done goto L8
.L7: # else: loop forever
	jmp .L7
.L8:
    movl x(%rip), %eax
    ret

컴파일러는 루프를 if로 대체한다. 또한,

컴파일러는 statements들을 reorder한다

#C11 Atomics

//waitingonaflag-atomic.c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdatomic.h>

atomic_bool done;
int x;

void thread1() {
    x = rand() % 2;
    done = true;
}
int thread2() {
    while (!done) { }
    return x;
}

//compiled with gcc 7.5
    thread1:
    subq $8, %rsp
    call rand@PLT
    movl %eax, %edx
    shrl $31, %edx
    addl %edx, %eax
    andl $1, %eax
    subl %edx, %eax
    movl %eax, x(%rip)
    movb $1, done(%rip)
    mfence
    addq $8, %rsp
    ret
    
thread2:
.L5:
    movzbl done(%rip), %eax
    testb %al, %al
    je .L5
    movl x(%rip), %eax
    ret

#C11 Atomics (ARM64)

//without atomics
thread1:
    stp x29, x30, [sp, -16]!
    mov x29, sp
    bl rand
    cmp w0, 0
    adrp x2, :got:x
    adrp x1, :got:done
    and w0, w0, 1
    mov w3, 1
    ldr x2, [x2, #:got_lo12:x]
    csneg w0, w0, w0, ge
    ldr x1, [x1, #:got_lo12:done]
    str w0, [x2]
    strb w3, [x1]
    ldp x29, x30, [sp], 16
    ret
thread2:
    adrp x0, :got:done
    ldr x0, [x0, #:got_lo12:done]
    ldrb w0, [x0]
    cbnz w0, .L8
.L7:
	b .L7
.L8:
    adrp x0, :got:x
    ldr x0, [x0, #:got_lo12:x]
    ldr w0, [x0]
    ret

//with atomics
thread1:
    stp x29, x30, [sp, -16]!
    mov x29, sp
    bl rand
    cmp w0, 0
    adrp x2, :got:x
    adrp x1, :got:done
    and w0, w0, 1
    ldr x2, [x2, #:got_lo12:x]
    csneg w0, w0, w0, ge
    ldr x1, [x1, #:got_lo12:done]
    str w0, [x2]
    mov w0, 1
    stlrb w0, [x1]
    ldp x29, x30, [sp], 16
    ret
thread2:
    adrp x1, :got:done
    ldr x1, [x1, #:got_lo12:done]
.L5:
    ldarb w0, [x1]
    tst w0, 255
    beq .L5
    adrp x0, :got:x
    ldr x0, [x0, #:got_lo12:x]
    ldr w0, [x0]
    ret

#Simple Use Cases

이전에는 안전하지 않았던 특정 액세스를 지금 수행할 수 있다

//Checking Write-Once Variables
atomic_bool gameover;
// ....
if (gameover) {
// game is over and it is safe to access the results
}

//Double Checked Locking Idiom
pthread_mutex_lock lock;
_Atomic struct sometype * s;

// Goal
struct sometype *
makeSingleton() {
	if (s == NULL) { // access without lock
		pthread_mutex_lock(&lock);
		if (s == NULL) { // double-check
            tmp = malloc(...);
            initialize(tmp);
            s = tmp; // s being atomic, all writes
                // inside initialize are seen
                // when another thread sees
                // s != NULL
		}
		pthread_mutex_unlock(&lock);
	}
	return s;
}

#Read-modify-write Operations

//atomicupdates.c
#include <stdatomic.h>

atomic_int i;
atomic_ulong d;

void atomic_updates()
{
    i = i + 5; // non-atomic
    i += 5;
    d /= 2;
}

//atomicupdates.s
atomic_updates:
    movl i(%rip), %eax
    addl $5, %eax
    movl %eax, i(%rip)
    mfence
    lock addl $5, i(%rip)
    movq d(%rip), %rax
.L2:
    movq %rax, %rdx
    shrq %rdx
    lock cmpxchgq %rdx, d(%rip)
    jne .L2
    rep ret

컴파일러에 의해 특정 작업이 원자성 업데이트로 전환된다
- 예: a++, a *= 2
- 그러나 a = a + 1은 예외이다
- 아키텍처에서 제공하는 원자 명령을 사용하거나 원자 비교 및 교환 또는 동등한 것에 기반한 루프를 사용한다

#Combining Atomics with Lock-based Synchronization

//Atomics + Condition Variables
atomic_bool gameover;
// ...
// BUG: Checking the condition `gameover`
// is not atomic with respect to calling pthread_cond_wait
while (!gameover) {
    pthread_mutex_lock(&lock);
    pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
}

이진 계측기 기반 레이스 감지 도구(Helgrind, DRD)는 일반적으로 원자성을 인식하지 못한다. 그렇다면 아래 두 개의 코드 중에 어떤 것을 써야 할까?

atomic_int inqueuecount; // count of items in queue
pthread_mutex_lock queuelock; // queue lock

void enqueue(struct item *item)
{
    inqueuecount++;
    pthread_mutex_lock(&queuelock);
    add_to_queue(item);
    pthread_mutex_unlock(&queuelock);
}

아니면

int inqueuecount;
pthread_mutex_lock queuelock;

void enqueue(struct item *item)
{
    pthread_mutex_lock(&queuelock);
    inqueuecount++;
    add_to_queue(item);
    pthread_mutex_unlock(&queuelock);
}

#Lock-free Synchronization

lock은 많은 시나리오에서 충분히 잘 작동하고 모든 종류의 원자 수정을 구현하기 위한 일반적인 시설을 제공하지만, 몇 가지 단점이 있다
- 동기화가 너무 거친 경우 CPU 활용률이 감소할 수 있는 가능성
- 너무 세밀한 경우 교착 상태에 빠질 수 있는 가능성 증가
- 레이스가 심한 경우 성능 저하 가능성
- 우선 순위 역전 가능성(저우선순위 스레드는 스레드가 원하는 lock을 유지하여 우선 순위가 높은 스레드를 유지함)
- Convoying: Convoying 현상이 발생하면, 많은 스레드들이 락을 기다리는 동안 대기 상태에 머무르게 된다. 이로 인해 CPU와 같은 시스템 자원의 활용도가 저하될 수 있다
- lock를 가지고 있는 스레드의 비동기 종료(킬)에 대한 적절한 지원이 없음
- Unix signals 잘 연동안됨
이러한 단점들은 원자 연산을 사용하여 구현되는 특정 "lock-free (락을 사용하지 않는)" 동기화 알고리즘을 만들었다.
- 데이터 구조: lock-free 스택, list 등
- 예: java.util.concurrent.ConcurrentHashMap

#Addendum: the volatile keyword

C/C++에서 volatile은 어떠한 접근(읽기 또는 쓰기)도 부작용으로 간주되어야 한다고 말하므로 컴파일러는 이를 최적화하거나 재정렬할 수 없다
예를 들어 메모리 매핑 I/O에 적합하다
아토믹과 달리 다른 스레드가 보는 것에 영향을 미치지 않으므로(펜스를 도입하거나 로드/스토어를 획득/해제하지 않음), 스레드 간 통신에 사용할 수 없다
- C11 전에 프로그래머들은 컴파일러가 원하는 코드를 생성하도록 하기 위해 신뢰할 수 없는 volatile을 사용을 시도 했다
자바에서 volatile은 기본 설정 memory_order_seq_cst는 C11/C++11의 원자 변수와 유사하지만 원자 read-modify-writes를 할 수 있는 기능은 없다는 차이점이 있다.

#Atomic Variable in Java

#Atomic vs Non-Atomic Operations

'Computer Science > Computer Systems' 카테고리의 다른 글

[Lecture 18] Virtual Memory / 가상 메모리: Principles and Mechanisms (0)	2022.11.19
[Lecture 15] Dynamic Memory Management (malloc/free) (1)	2022.11.06
[Lecture 13] 멀티쓰레딩 VII - Performance (0)	2022.10.19
[Lecture 12] 멀티쓰레딩 VI - Atomicity Violations (0)	2022.10.19
[Lecture 11] 멀티쓰레딩 V - Deadlock (0)	2022.10.19

CS 노트

[Lecture 14] 멀티쓰레딩 VIII - Atomic Variables and Operations

#Introduction

#failure of busy-waiting “done” flag check

#C11 Atomics

#C11 Atomics (ARM64)

#Simple Use Cases

#Read-modify-write Operations

#Combining Atomics with Lock-based Synchronization

#Lock-free Synchronization

#Addendum: the volatile keyword

#Atomic Variable in Java

#Atomic vs Non-Atomic Operations

'Computer Science > Computer Systems' 카테고리의 다른 글

티스토리툴바

[Lecture 14] 멀티쓰레딩 VIII - Atomic Variables and Operations

#Introduction

#failure of busy-waiting “done” flag check

#C11 Atomics

#C11 Atomics (ARM64)

#Simple Use Cases

#Read-modify-write Operations

#Combining Atomics with Lock-based Synchronization

#Lock-free Synchronization

#Addendum: the volatile keyword

#Atomic Variable in Java

#Atomic vs Non-Atomic Operations

'Computer Science > Computer Systems' 카테고리의 다른 글

'Computer Science/Computer Systems' Related Articles

티스토리툴바