riscv-linux-remote-sfence

This post looks at how Linux does multi-hart sfence on the RISC-V architecture.

The RISC-V architecture only defines the sfence.vma instruction, which flushes the TLB of the CPU that executes it. In a multi-core system this is not enough: when one hart modifies a page table and executes sfence.vma, only its own TLB gets flushed, and stale entries may remain in the TLBs of the other harts. Linux therefore needs a mechanism to flush the TLBs of remote harts, and this post walks through how it does that.

In the Linux source, arch/riscv/mm/tlbflush.c:

static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
                                  unsigned long size, unsigned long stride)
{
    struct cpumask *cmask = mm_cpumask(mm);
    struct cpumask hmask;
    unsigned int cpuid;
    bool broadcast;

    if (cpumask_empty(cmask))
        return;

    cpuid = get_cpu();
    /* check if the tlbflush needs to be sent to other CPUs */
    broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids;
    if (static_branch_unlikely(&use_asid_allocator)) {
        unsigned long asid = atomic_long_read(&mm->context.id);

        if (broadcast) {
            riscv_cpuid_to_hartid_mask(cmask, &hmask);
            sbi_remote_sfence_vma_asid(cpumask_bits(&hmask),
                                       start, size, asid);
        } else if (size <= stride) {
            local_flush_tlb_page_asid(start, asid);
        } else {
            local_flush_tlb_all_asid(asid);
        }
    } else {
        if (broadcast) {
            riscv_cpuid_to_hartid_mask(cmask, &hmask);
            sbi_remote_sfence_vma(cpumask_bits(&hmask),
                                  start, size);
        } else if (size <= stride) {
            local_flush_tlb_page(start);
        } else {
            local_flush_tlb_all();
        }
    }

    put_cpu();
}

This is the low-level function Linux calls to flush the TLB. The key line is:

broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids;

nr_cpu_ids: the total number of possible CPU IDs in the system (one past the highest valid CPU number).

cpumask_any_but(mask, cpu): picks any CPU set in mask other than cpu; if no such CPU exists, it returns a value >= nr_cpu_ids.

So this line tests whether the mm's cpumask contains any CPU besides the current one. If not, broadcast is false and the local_flush path runs, i.e. an sfence.vma instruction is executed directly to flush this CPU's TLB. Otherwise broadcast is true, and an SBI call is made so the SBI firmware can notify the other harts.
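
For reference, the local path really is just the bare instruction. The helpers in arch/riscv/include/asm/tlbflush.h from the same kernel era look roughly like this:

/* Flush the whole local TLB. */
static inline void local_flush_tlb_all(void)
{
    __asm__ __volatile__ ("sfence.vma" : : : "memory");
}

/* Flush one page from the local TLB. */
static inline void local_flush_tlb_page(unsigned long addr)
{
    __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
}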

The SBI call is issued from arch/riscv/kernel/sbi.c:

/**
 * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote
 *                           harts for the specified virtual address range.
 * @hart_mask: A cpu mask containing all the target harts.
 * @start: Start of the virtual address
 * @size: Total size of the virtual address range.
 *
 * Return: 0 on success, appropriate linux error code otherwise.
 */
int sbi_remote_sfence_vma(const unsigned long *hart_mask,
                          unsigned long start,
                          unsigned long size)
{
    return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
                        hart_mask, start, size, 0, 0);
}
EXPORT_SYMBOL(sbi_remote_sfence_vma);
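
__sbi_rfence() bottoms out in an ecall into M-mode: the target harts are described by an (hmask, hbase) pair in a0/a1, the virtual address range goes in a2/a3, the function ID in a6, and the RFENCE extension ID in a7. A condensed sketch of that trampoline (the real sbi_ecall() in the same file takes six argument registers; the four-argument sbi_ecall_sketch name here is just for illustration):

struct sbiret {
    long error;
    long value;
};

/* Condensed sketch of sbi_ecall(): marshal arguments into a0-a3,
 * the function ID into a6, the extension ID into a7, trap into
 * M-mode, and read back the error/value pair from a0/a1. */
static struct sbiret sbi_ecall_sketch(int ext, int fid,
                                      unsigned long arg0, unsigned long arg1,
                                      unsigned long arg2, unsigned long arg3)
{
    struct sbiret ret;

    register unsigned long a0 asm ("a0") = arg0;
    register unsigned long a1 asm ("a1") = arg1;
    register unsigned long a2 asm ("a2") = arg2;
    register unsigned long a3 asm ("a3") = arg3;
    register unsigned long a6 asm ("a6") = (unsigned long)fid;
    register unsigned long a7 asm ("a7") = (unsigned long)ext;

    asm volatile ("ecall"
                  : "+r" (a0), "+r" (a1)
                  : "r" (a2), "r" (a3), "r" (a6), "r" (a7)
                  : "memory");
    ret.error = a0;
    ret.value = a1;
    return ret;
}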

On the firmware side, OpenSBI handles this call in lib/sbi/sbi_ecall_replace.c:

case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
    SBI_TLB_INFO_INIT(&tlb_info, regs->a2, regs->a3, 0, 0,
                      sbi_tlb_local_sfence_vma, source_hart);
    ret = sbi_tlb_request(regs->a0, regs->a1, &tlb_info);
    break;
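
The sbi_tlb_local_sfence_vma passed in here is the local_fn that each target hart will eventually run after it receives the IPI; it simply issues sfence.vma across the requested range, or flushes everything for a whole-address-space request. Roughly, as a sketch assuming 4 KiB pages:

/* Rough sketch of sbi_tlb_local_sfence_vma() in lib/sbi/sbi_tlb.c:
 * a full flush for "flush everything" requests, otherwise one
 * sfence.vma per page in the range. */
static void sfence_vma_range(unsigned long start, unsigned long size)
{
    unsigned long i;

    if ((start == 0 && size == 0) || size == SBI_TLB_FLUSH_ALL) {
        __asm__ __volatile__ ("sfence.vma" : : : "memory");
        return;
    }

    for (i = 0; i < size; i += 0x1000)    /* PAGE_SIZE */
        __asm__ __volatile__ ("sfence.vma %0"
                              : : "r" (start + i) : "memory");
}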

Then lib/sbi/sbi_tlb.c turns the request into an IPI:

int sbi_tlb_request(ulong hmask, ulong hbase, struct sbi_tlb_info *tinfo)
{
    if (!tinfo->local_fn)
        return SBI_EINVAL;

    tlb_pmu_incr_fw_ctr(tinfo);

    return sbi_ipi_send_many(hmask, hbase, tlb_event, tinfo);
}
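
One subtlety: the IPI itself carries no payload, so tlb_info travels through memory instead. As I understand it, the TLB event's update callback (sbi_tlb_update()) copies the sbi_tlb_info into a FIFO in each destination hart's scratch area before the interrupt is raised, and the sync callback spins until the destinations have drained their FIFOs. These are the ipi_ops->update and ipi_ops->sync hooks visible in sbi_ipi_send() below.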

lib/sbi/sbi_ipi.c then delivers the IPI through the platform's IPI device:

static int sbi_ipi_send(struct sbi_scratch *scratch, u32 remote_hartid,
                        u32 event, void *data)
{
    int ret;
    struct sbi_scratch *remote_scratch = NULL;
    struct sbi_ipi_data *ipi_data;
    const struct sbi_ipi_event_ops *ipi_ops;

    if ((SBI_IPI_EVENT_MAX <= event) ||
        !ipi_ops_array[event])
        return SBI_EINVAL;
    ipi_ops = ipi_ops_array[event];

    remote_scratch = sbi_hartid_to_scratch(remote_hartid);
    if (!remote_scratch)
        return SBI_EINVAL;

    ipi_data = sbi_scratch_offset_ptr(remote_scratch, ipi_data_off);

    if (ipi_ops->update) {
        ret = ipi_ops->update(scratch, remote_scratch,
                              remote_hartid, data);
        if (ret < 0)
            return ret;
    }

    /*
     * Set IPI type on remote hart's scratch area and
     * trigger the interrupt
     */
    atomic_raw_set_bit(event, &ipi_data->ipi_type);
    smp_wmb();

    if (ipi_dev && ipi_dev->ipi_send)
        ipi_dev->ipi_send(remote_hartid);

    sbi_pmu_ctr_incr_fw(SBI_PMU_FW_IPI_SENT);

    if (ipi_ops->sync)
        ipi_ops->sync(scratch);

    return 0;
}

/**
 * As this function only handles scalar values of the hart mask, it must be
 * set to all online harts if the intention is to send IPIs to all the harts.
 * If hmask is zero, no IPIs will be sent.
 */
int sbi_ipi_send_many(ulong hmask, ulong hbase, u32 event, void *data)
{
    int rc;
    ulong i, m;
    struct sbi_domain *dom = sbi_domain_thishart_ptr();
    struct sbi_scratch *scratch = sbi_scratch_thishart_ptr();

    if (hbase != -1UL) {
        rc = sbi_hsm_hart_interruptible_mask(dom, hbase, &m);
        if (rc)
            return rc;
        m &= hmask;

        /* Send IPIs */
        for (i = hbase; m; i++, m >>= 1) {
            if (m & 1UL)
                sbi_ipi_send(scratch, i, event, data);
        }
    } else {
        hbase = 0;
        while (!sbi_hsm_hart_interruptible_mask(dom, hbase, &m)) {
            /* Send IPIs */
            for (i = hbase; m; i++, m >>= 1) {
                if (m & 1UL)
                    sbi_ipi_send(scratch, i, event, data);
            }
            hbase += BITS_PER_LONG;
        }
    }

    return 0;
}
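
Note the hart-mask encoding, which follows the SBI v0.2 convention: hbase is the hartid that bit 0 of hmask refers to, and hbase == -1UL means "all interruptible harts". For example, hbase = 4 with hmask = 0b101 targets harts 4 and 6.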

On a platform using the ACLINT MSWI device, this should end up in lib/utils/ipi/aclint_mswi.c, which finally writes 1 into the target hart's msip register to raise a machine software interrupt there:

static void mswi_ipi_send(u32 target_hart)
{
    u32 *msip;
    struct aclint_mswi_data *mswi;

    if (SBI_HARTMASK_MAX_BITS <= target_hart)
        return;
    mswi = mswi_hartid2data[target_hart];
    if (!mswi)
        return;

    /* Set ACLINT IPI */
    msip = (void *)mswi->addr;
    writel(1, &msip[target_hart - mswi->first_hartid]);
}
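
The receiving side is the mirror image: the target hart takes the machine software interrupt, and OpenSBI's trap handler calls sbi_ipi_process(), which acks the interrupt, grabs the pending event bitmap from the hart's scratch area, and runs each event's process callback; for the TLB event this drains the queued sbi_tlb_info requests and invokes their local_fn, i.e. the actual sfence.vma. A condensed paraphrase of that dispatch loop, from memory:

/* Condensed paraphrase of sbi_ipi_process() in lib/sbi/sbi_ipi.c:
 * clear the pending MSWI, atomically take the ipi_type bitmap,
 * then run the process callback of every pending event. */
void sbi_ipi_process(void)
{
    unsigned long ipi_type;
    unsigned int ipi_event = 0;
    struct sbi_scratch *scratch = sbi_scratch_thishart_ptr();
    struct sbi_ipi_data *ipi_data =
            sbi_scratch_offset_ptr(scratch, ipi_data_off);

    if (ipi_dev && ipi_dev->ipi_clear)
        ipi_dev->ipi_clear(current_hartid());    /* clear msip */

    ipi_type = atomic_raw_xchg_ulong(&ipi_data->ipi_type, 0);
    while (ipi_type) {
        if ((ipi_type & 1UL) && ipi_ops_array[ipi_event] &&
            ipi_ops_array[ipi_event]->process)
            ipi_ops_array[ipi_event]->process(scratch);
        ipi_type >>= 1;
        ipi_event++;
    }
}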

To sum up, Linux implements remote sfence with an SBI call plus IPIs: the hart that changed the page table traps into the SBI firmware, the firmware sends an IPI to every target hart, and each target hart executes sfence.vma locally while handling that interrupt.

Reference

https://github.com/riscv/riscv-sbi-doc/issues/42