riscv-linux-remote-sfence

This post looks at how Linux does multi-hart sfence on the RISC-V architecture.

The RISC-V architecture only defines the sfence.vma instruction, which flushes the TLB of the CPU that executes it. In a multi-core system this is not enough: when one hart modifies a page table and executes sfence.vma, only its own TLB gets flushed, and stale entries may remain in the TLBs of the other harts. Linux therefore needs a mechanism to flush the TLBs of remote harts, and this post walks through how it does that.

In the Linux source, arch/riscv/mm/tlbflush.c:

static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
                                  unsigned long size, unsigned long stride)
{
    struct cpumask *cmask = mm_cpumask(mm);
    struct cpumask hmask;
    unsigned int cpuid;
    bool broadcast;

    if (cpumask_empty(cmask))
        return;

    cpuid = get_cpu();
    /* check if the tlbflush needs to be sent to other CPUs */
    broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids;
    if (static_branch_unlikely(&use_asid_allocator)) {
        unsigned long asid = atomic_long_read(&mm->context.id);

        if (broadcast) {
            riscv_cpuid_to_hartid_mask(cmask, &hmask);
            sbi_remote_sfence_vma_asid(cpumask_bits(&hmask),
                                       start, size, asid);
        } else if (size <= stride) {
            local_flush_tlb_page_asid(start, asid);
        } else {
            local_flush_tlb_all_asid(asid);
        }
    } else {
        if (broadcast) {
            riscv_cpuid_to_hartid_mask(cmask, &hmask);
            sbi_remote_sfence_vma(cpumask_bits(&hmask),
                                  start, size);
        } else if (size <= stride) {
            local_flush_tlb_page(start);
        } else {
            local_flush_tlb_all();
        }
    }

    put_cpu();
}

This is the low-level function Linux calls to flush the TLB. The key line is:

broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids;

nr_cpu_ids: the total number of possible CPU IDs in the system (one past the highest valid CPU number).

cpumask_any_but(mask, cpu): picks any CPU set in mask other than cpu; if no such CPU exists, it returns a value >= nr_cpu_ids.

So this line tests whether the mm's cpumask contains any CPU besides the current one. If not, broadcast is false and the local_flush path runs, i.e. an sfence.vma instruction is executed directly to flush this CPU's TLB. Otherwise broadcast is true, and an SBI call is made so the SBI firmware can notify the other harts.
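
For reference, the local path really is just the bare instruction. The helpers in arch/riscv/include/asm/tlbflush.h from the same kernel era look roughly like this:

/* Flush the whole local TLB. */
static inline void local_flush_tlb_all(void)
{
    __asm__ __volatile__ ("sfence.vma" : : : "memory");
}

/* Flush one page from the local TLB. */
static inline void local_flush_tlb_page(unsigned long addr)
{
    __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
}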

The SBI call is issued from arch/riscv/kernel/sbi.c:

/**
 * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote
 *                           harts for the specified virtual address range.
 * @hart_mask: A cpu mask containing all the target harts.
 * @start: Start of the virtual address
 * @size: Total size of the virtual address range.
 *
 * Return: 0 on success, appropriate linux error code otherwise.
 */
int sbi_remote_sfence_vma(const unsigned long *hart_mask,
                          unsigned long start,
                          unsigned long size)
{
    return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
                        hart_mask, start, size, 0, 0);
}
EXPORT_SYMBOL(sbi_remote_sfence_vma);
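
__sbi_rfence() bottoms out in an ecall into M-mode: the target harts are described by an (hmask, hbase) pair in a0/a1, the virtual address range goes in a2/a3, the function ID in a6, and the RFENCE extension ID in a7. A condensed sketch of that trampoline (the real sbi_ecall() in the same file takes six argument registers; the four-argument sbi_ecall_sketch name here is just for illustration):

struct sbiret {
    long error;
    long value;
};

/* Condensed sketch of sbi_ecall(): marshal arguments into a0-a3,
 * the function ID into a6, the extension ID into a7, trap into
 * M-mode, and read back the error/value pair from a0/a1. */
static struct sbiret sbi_ecall_sketch(int ext, int fid,
                                      unsigned long arg0, unsigned long arg1,
                                      unsigned long arg2, unsigned long arg3)
{
    struct sbiret ret;

    register unsigned long a0 asm ("a0") = arg0;
    register unsigned long a1 asm ("a1") = arg1;
    register unsigned long a2 asm ("a2") = arg2;
    register unsigned long a3 asm ("a3") = arg3;
    register unsigned long a6 asm ("a6") = (unsigned long)fid;
    register unsigned long a7 asm ("a7") = (unsigned long)ext;

    asm volatile ("ecall"
                  : "+r" (a0), "+r" (a1)
                  : "r" (a2), "r" (a3), "r" (a6), "r" (a7)
                  : "memory");
    ret.error = a0;
    ret.value = a1;
    return ret;
}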

On the firmware side, OpenSBI handles this call in lib/sbi/sbi_ecall_replace.c:

case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
    SBI_TLB_INFO_INIT(&tlb_info, regs->a2, regs->a3, 0, 0,
                      sbi_tlb_local_sfence_vma, source_hart);
    ret = sbi_tlb_request(regs->a0, regs->a1, &tlb_info);
    break;
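
The sbi_tlb_local_sfence_vma passed in here is the local_fn that each target hart will eventually run after it receives the IPI; it simply issues sfence.vma across the requested range, or flushes everything for a whole-address-space request. Roughly, as a sketch assuming 4 KiB pages:

/* Rough sketch of sbi_tlb_local_sfence_vma() in lib/sbi/sbi_tlb.c:
 * a full flush for "flush everything" requests, otherwise one
 * sfence.vma per page in the range. */
static void sfence_vma_range(unsigned long start, unsigned long size)
{
    unsigned long i;

    if ((start == 0 && size == 0) || size == SBI_TLB_FLUSH_ALL) {
        __asm__ __volatile__ ("sfence.vma" : : : "memory");
        return;
    }

    for (i = 0; i < size; i += 0x1000)    /* PAGE_SIZE */
        __asm__ __volatile__ ("sfence.vma %0"
                              : : "r" (start + i) : "memory");
}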

Then lib/sbi/sbi_tlb.c turns the request into an IPI:

int sbi_tlb_request(ulong hmask, ulong hbase, struct sbi_tlb_info *tinfo)
{
    if (!tinfo->local_fn)
        return SBI_EINVAL;

    tlb_pmu_incr_fw_ctr(tinfo);

    return sbi_ipi_send_many(hmask, hbase, tlb_event, tinfo);
}
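
One subtlety: the IPI itself carries no payload, so tlb_info travels through memory instead. As I understand it, the TLB event's update callback (sbi_tlb_update()) copies the sbi_tlb_info into a FIFO in each destination hart's scratch area before the interrupt is raised, and the sync callback spins until the destinations have drained their FIFOs. These are the ipi_ops->update and ipi_ops->sync hooks visible in sbi_ipi_send() below.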

lib/sbi/sbi_ipi.c then delivers the IPI through the platform's IPI device:

static int sbi_ipi_send(struct sbi_scratch *scratch, u32 remote_hartid,
                        u32 event, void *data)
{
    int ret;
    struct sbi_scratch *remote_scratch = NULL;
    struct sbi_ipi_data *ipi_data;
    const struct sbi_ipi_event_ops *ipi_ops;

    if ((SBI_IPI_EVENT_MAX <= event) ||
        !ipi_ops_array[event])
        return SBI_EINVAL;
    ipi_ops = ipi_ops_array[event];

    remote_scratch = sbi_hartid_to_scratch(remote_hartid);
    if (!remote_scratch)
        return SBI_EINVAL;

    ipi_data = sbi_scratch_offset_ptr(remote_scratch, ipi_data_off);

    if (ipi_ops->update) {
        ret = ipi_ops->update(scratch, remote_scratch,
                              remote_hartid, data);
        if (ret < 0)
            return ret;
    }

    /*
     * Set IPI type on remote hart's scratch area and
     * trigger the interrupt
     */
    atomic_raw_set_bit(event, &ipi_data->ipi_type);
    smp_wmb();

    if (ipi_dev && ipi_dev->ipi_send)
        ipi_dev->ipi_send(remote_hartid);

    sbi_pmu_ctr_incr_fw(SBI_PMU_FW_IPI_SENT);

    if (ipi_ops->sync)
        ipi_ops->sync(scratch);

    return 0;
}

/**
 * As this function only handles scalar values of the hart mask, it must be
 * set to all online harts if the intention is to send IPIs to all the harts.
 * If hmask is zero, no IPIs will be sent.
 */
int sbi_ipi_send_many(ulong hmask, ulong hbase, u32 event, void *data)
{
    int rc;
    ulong i, m;
    struct sbi_domain *dom = sbi_domain_thishart_ptr();
    struct sbi_scratch *scratch = sbi_scratch_thishart_ptr();

    if (hbase != -1UL) {
        rc = sbi_hsm_hart_interruptible_mask(dom, hbase, &m);
        if (rc)
            return rc;
        m &= hmask;

        /* Send IPIs */
        for (i = hbase; m; i++, m >>= 1) {
            if (m & 1UL)
                sbi_ipi_send(scratch, i, event, data);
        }
    } else {
        hbase = 0;
        while (!sbi_hsm_hart_interruptible_mask(dom, hbase, &m)) {
            /* Send IPIs */
            for (i = hbase; m; i++, m >>= 1) {
                if (m & 1UL)
                    sbi_ipi_send(scratch, i, event, data);
            }
            hbase += BITS_PER_LONG;
        }
    }

    return 0;
}
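
Note the hart-mask encoding, which follows the SBI v0.2 convention: hbase is the hartid that bit 0 of hmask refers to, and hbase == -1UL means "all interruptible harts". For example, hbase = 4 with hmask = 0b101 targets harts 4 and 6.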

On a platform using the ACLINT MSWI device, this should end up in lib/utils/ipi/aclint_mswi.c, which finally writes 1 into the target hart's msip register to raise a machine software interrupt there:

static void mswi_ipi_send(u32 target_hart)
{
    u32 *msip;
    struct aclint_mswi_data *mswi;

    if (SBI_HARTMASK_MAX_BITS <= target_hart)
        return;
    mswi = mswi_hartid2data[target_hart];
    if (!mswi)
        return;

    /* Set ACLINT IPI */
    msip = (void *)mswi->addr;
    writel(1, &msip[target_hart - mswi->first_hartid]);
}
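
The receiving side is the mirror image: the target hart takes the machine software interrupt, and OpenSBI's trap handler calls sbi_ipi_process(), which acks the interrupt, grabs the pending event bitmap from the hart's scratch area, and runs each event's process callback; for the TLB event this drains the queued sbi_tlb_info requests and invokes their local_fn, i.e. the actual sfence.vma. A condensed paraphrase of that dispatch loop, from memory:

/* Condensed paraphrase of sbi_ipi_process() in lib/sbi/sbi_ipi.c:
 * clear the pending MSWI, atomically take the ipi_type bitmap,
 * then run the process callback of every pending event. */
void sbi_ipi_process(void)
{
    unsigned long ipi_type;
    unsigned int ipi_event = 0;
    struct sbi_scratch *scratch = sbi_scratch_thishart_ptr();
    struct sbi_ipi_data *ipi_data =
            sbi_scratch_offset_ptr(scratch, ipi_data_off);

    if (ipi_dev && ipi_dev->ipi_clear)
        ipi_dev->ipi_clear(current_hartid());    /* clear msip */

    ipi_type = atomic_raw_xchg_ulong(&ipi_data->ipi_type, 0);
    while (ipi_type) {
        if ((ipi_type & 1UL) && ipi_ops_array[ipi_event] &&
            ipi_ops_array[ipi_event]->process)
            ipi_ops_array[ipi_event]->process(scratch);
        ipi_type >>= 1;
        ipi_event++;
    }
}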

To sum up, Linux implements remote sfence with an SBI call plus IPIs: the hart that changed the page table traps into the SBI firmware, the firmware sends an IPI to every target hart, and each target hart executes sfence.vma locally while handling that interrupt.

Reference

https://github.com/riscv/riscv-sbi-doc/issues/42