2.中断过程
/*
 * Drive an interrupt line to the given level by invoking the handler
 * registered on it, passing the line's opaque pointer and line number.
 * A NULL line is silently ignored.
 */
void qemu_set_irq(qemu_irq irq, int level)
{
    if (irq) {
        irq->handler(irq->opaque, irq->n, level);
    }
}
设置中断控制器的handler,大致分为三种情况:
1.cpu_irq的handler ===> pic_irq_request
2.内核模拟中断控制器的handler ===> kvm_i8259_set_irq
3.用户模拟中断控制器的handler ===> i8259_set_irq
/*
 * PC hardware initialisation.
 *
 * Excerpt: only the interrupt-controller wiring is shown; the remainder of
 * the body and the function's closing brace are not part of this snippet.
 */
static void pc_init1()
{
/* Allocate one CPU-facing interrupt line handled by pic_irq_request. */
cpu_irq = qemu_allocate_irqs(pic_irq_request, NULL,
1);
#ifdef KVM_CAP_IRQCHIP
/* In-kernel irqchip: the 8259 lines come from kvm_i8259_init(). */
if (kvm_enabled() &&
kvm_irqchip_in_kernel()) {
isa_irq_state =
qemu_mallocz(sizeof(*isa_irq_state));
isa_irq = i8259 =
kvm_i8259_init(cpu_irq[0]);
} else
#endif
{
/*
 * Userspace irqchip: create the emulated 8259 on top of the CPU line,
 * record it in isa_irq_state, and allocate 24 ISA lines that are all
 * dispatched through isa_irq_handler.
 */
i8259 =
i8259_init(cpu_irq[0]);
isa_irq_state =
qemu_mallocz(sizeof(*isa_irq_state));
isa_irq_state->i8259
= i8259;
isa_irq =
qemu_allocate_irqs(isa_irq_handler, isa_irq_state, 24);
}
先研究用户空间中断控制器的中断发生过程
/*
 * Interrupt-line handler for the userspace-emulated 8259 pair: latch the
 * new level on the controller that owns the line, then re-evaluate
 * whether the CPU line has to be raised.
 *
 * opaque: the PicState2 holding both controllers.
 * irq:    line number; the bits above the low three select the
 *         controller, the low three bits select its input pin.
 * level:  new level of the line.
 */
static void i8259_set_irq(void *opaque, int irq, int level)
{
    PicState2 *pic_pair = opaque;
    const int chip = irq >> 3;
    const int pin = irq & 7;

    pic_set_irq1(&pic_pair->pics[chip], pin, level);
    pic_update_irq(pic_pair);
}
中断触发方式分为电平触发和边沿触发,isa设备大多数采用边沿触发,pci设备采用电平触发。
假如采用边沿触发,如果level等于1,并且该引脚上一次记录的电平为0(last_irr中对应位为0,即检测到上升沿),则把中断请求寄存器(irr)的对应位置1,同时在last_irr中记录当前电平为1。
如果last_irr中对应位已经为1(电平一直保持为高,没有新的上升沿),并不会再次设置中断请求寄存器,可见重复的请求会被合并,允许中断丢失。
/*
 * Set the level of one input pin of a single 8259.
 *
 * Pins whose bit is set in s->elcr are level triggered: both irr and
 * last_irr simply follow the line level.
 * All other pins are edge triggered: irr is set only when the line goes
 * from 0 to 1 (detected by comparing against last_irr); lowering the
 * line updates last_irr but leaves irr untouched.
 *
 * s:     the controller owning the pin.
 * irq:   pin number on that controller.
 * level: new level of the line (zero / non-zero).
 */
static inline void pic_set_irq1(PicState *s, int irq, int level)
{
    int mask;

    mask = 1 << irq;
    if (s->elcr & mask) {
        /* level triggered */
        if (level) {
            s->irr |= mask;
            s->last_irr |= mask;
        } else {
            s->irr &= ~mask;
            s->last_irr &= ~mask;
        }
    } else {
        /* edge triggered */
        if (level) {
            /* Only a rising edge latches a new request. */
            if ((s->last_irr & mask) == 0)
                s->irr |= mask;
            s->last_irr |= mask;
        } else {
            s->last_irr &= ~mask;
        }
    }
}
每次中断请求状态可能发生变化时,都必须调用该函数。该函数在必要时向CPU提出中断请求(可能造成中断嵌套)。什么情况下是"必要"呢?具体可参照pic_get_irq()函数。
这个函数对新产生中断的优先级和正在处理中断的优先级进行比较,如果前者更高,就请求注入该中断。中断请求是由qemu_irq_raise触发的,下面列出该函数。
/*
 * Raise the interrupt line to the CPU if necessary. Must be called every
 * time the active irq may change.
 *
 * Asks the first controller (s->pics[0]) for the irq it wants to deliver
 * and, when the answer is non-negative, raises s->parent_irq.
 */
void pic_update_irq(PicState2 *s)
{
    int irq;

    /* look at requested irq */
    irq = pic_get_irq(&s->pics[0]);
    if (irq >= 0) {
        qemu_irq_raise(s->parent_irq);
    }
}
不要认为好像又循环到中断入口了,实际没有。关键在于参数s->parent_irq,通过该参数实际调用的是cpu_irq的handler ===>
pic_irq_request
/* Assert an interrupt line: shorthand for qemu_set_irq() with level 1. */
static inline void qemu_irq_raise(qemu_irq irq)
{
qemu_set_irq(irq, 1);
}
目前只研究用户态模拟中断控制器i8259(剔除KVM模拟和apic中断控制器)。
cpu_interrupt函数实际上会打断当前虚拟处理器的运行,为硬件中断注入做好准备,此时就是中断注入的时机。如何打断(暂停)虚拟处理器运行呢?通过pthread_kill(env->kvm_cpu_state.thread,
SIG_IPI)向vcpu线程发送信号来打断处理器运行;
/*
 * Handler of the CPU interrupt line (cpu_irq): request a hardware
 * interrupt on the first CPU when the line is asserted, and withdraw the
 * request when it is lowered.
 *
 * opaque and irq are not used by the code shown here.
 */
static void pic_irq_request(void *opaque, int irq, int level)
{
    CPUState *env = first_cpu;

    if (level) {
        cpu_interrupt(env, CPU_INTERRUPT_HARD);
    } else {
        cpu_reset_interrupt(env, CPU_INTERRUPT_HARD);
    }
}
中断注入
中断注入负责将虚拟中断控制器采集的中断请求注入到虚拟处理器。需要处理两个问题,什么时候注入,如何注入?
/*
 * Per-vcpu main loop.
 *
 * Each iteration decides whether the vcpu may run. With a userspace
 * irqchip, pending irqchip events are processed first and a halted vcpu
 * is not run. A runnable vcpu is executed and then waits with a zero
 * timeout; otherwise the loop waits with a timeout of 1000.
 *
 * The loop has no exit in this excerpt, so the trailing unlock/return
 * are never reached.
 */
static int kvm_main_loop_cpu(CPUState *env)
{
    for (;;) {
        int run_cpu = !is_cpu_stopped(env);

        if (run_cpu && !kvm_irqchip_in_kernel()) {
            process_irqchip_events(env);
            run_cpu = !env->halted;
        }

        if (!run_cpu) {
            kvm_main_loop_wait(env, 1000);
            continue;
        }

        kvm_cpu_exec(env);
        kvm_main_loop_wait(env, 0);
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
如果中断控制器不是内核空间模拟(用户空间模拟),进行中断注入。
kvm_main_loop_cpu-->kvm_cpu_exec-->kvm_run
/*
 * Enter the guest (excerpt).
 *
 * Only the pre-entry interrupt step is shown; the declarations of `kvm`
 * and `run`, the actual guest entry and the return statement are omitted
 * from this snippet.
 */
int kvm_run(CPUState *env)
{
#if !defined(__s390__)
/*
 * With a userspace irqchip, try to push a pending interrupt now and
 * store the result in request_interrupt_window.
 */
if (!kvm->irqchip_in_kernel)
run->request_interrupt_window =
kvm_arch_try_push_interrupts(env);
#endif
}
1.首先满足三个条件
1)内核kvm准备好了接受中断注入
2)有中断请求并且为硬件中断请求
3)虚拟处理器允许中断(开中断,即eflags的IF位为1)
2.获取中断请求号
3.kvm注入中断请求
/*
 * Try to inject a pending hardware interrupt into the current vcpu.
 *
 * Injection is attempted only when all three conditions hold: KVM is
 * ready for injection, a hard interrupt request is pending, and the
 * guest has IF set in eflags. In that case the request flag is cleared,
 * the vector is fetched from the interrupt controller and, if
 * non-negative, handed to kvm_inject_irq(); a failure is only logged.
 *
 * opaque is not used by the code shown here.
 *
 * Returns non-zero when a hard interrupt request is still pending
 * afterwards, zero otherwise.
 */
int kvm_arch_try_push_interrupts(void *opaque)
{
    CPUState *env = cpu_single_env;
    int can_inject;

    can_inject = kvm_is_ready_for_interrupt_injection(env)
                 && (env->interrupt_request & CPU_INTERRUPT_HARD)
                 && (env->eflags & IF_MASK);

    if (can_inject) {
        int vector;

        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        vector = cpu_get_pic_interrupt(env);
        if (vector >= 0 && kvm_inject_irq(env, vector) < 0) {
            printf("cpu %d fail inject %x\n", env->cpu_index, vector);
        }
    }

    return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
}
//////////////////////////////////////////////////////////////////
内核空间中断采集
中断控制器有两种:8259和apic。这两个设备在用户空间的模拟过程在上面已分析。接下来看一下在内核态的模拟,我们暂且称之为内核空间中断采集。
/*
 * Interrupt-line handler used when the 8259 is emulated in the kernel:
 * forward the level change via kvm_set_irq(). When that call returns
 * non-zero and reports a non-zero status in pic_ret, mark the interrupt
 * as delivered through apic_set_irq_delivered().
 *
 * opaque is not used by the code shown here.
 */
static void kvm_i8259_set_irq(void *opaque, int irq, int level)
{
    int pic_ret;

    if (!kvm_set_irq(irq, level, &pic_ret)) {
        return;
    }
    if (pic_ret) {
        apic_set_irq_delivered();
    }
}
通过/dev/kvm接口(ioctl)进入内核。
/*
 * VM ioctl dispatcher (excerpt).
 *
 * Only the KVM_IRQ_LINE / KVM_IRQ_LINE_STATUS cases are shown; the
 * enclosing switch, the declarations of `kvm`, `r` and `argp`, the `out`
 * label and the return statement are not part of this snippet.
 */
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
/* Preset the error code for the copy_from_user/copy_to_user failures. */
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
/* The request is acted on only when the irqchip lives in the kernel. */
if (irqchip_in_kernel(kvm)) {
__s32 status;
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
/* Only the _STATUS variant copies the result back to the caller. */
if (ioctl == KVM_IRQ_LINE_STATUS) {
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
sizeof irq_event))
goto out;
}
r = 0;
}
break;
}
}
硬件中断进入8259和apic中断控制器
/*
 * Deliver an interrupt-line change to every routing entry mapped to the
 * given irq number.
 *
 * The matching entries are copied into a local array while the routing
 * table is read under rcu_read_lock(); their set() callbacks are invoked
 * after rcu_read_unlock().
 *
 * Returns -1 when no callback returned a non-negative value, otherwise
 * the sum of all non-negative callback results.
 */
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
{
    struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
    struct kvm_irq_routing_table *irq_rt;
    struct hlist_node *n;
    int ret = -1;
    int i = 0;

    trace_kvm_set_irq(irq, level, irq_source_id);

    /*
     * Not possible to detect if the guest uses the PIC or the IOAPIC.
     * So set the bit in both. The guest will ignore writes to the
     * unused one.
     */
    rcu_read_lock();
    irq_rt = rcu_dereference(kvm->irq_routing);
    if (irq < irq_rt->nr_rt_entries) {
        hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
            irq_set[i++] = *e;
        }
    }
    rcu_read_unlock();

    while (i--) {
        int r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);

        if (r >= 0) {
            /* The first successful result replaces the initial -1. */
            if (ret < 0)
                ret = r;
            else
                ret += r;
        }
    }

    return ret;
}
为了简单化,我们只分析8259中断控制器
/*
 * Routing-entry callback for the in-kernel 8259.
 *
 * On x86 the requested level is first combined with the per-pin state
 * for this irq source via kvm_irq_line_state(), and the resulting level
 * is applied to pin e->irqchip.pin of the PIC. Without CONFIG_X86 the
 * function always returns -1.
 */
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
                           struct kvm *kvm, int irq_source_id, int level)
{
#ifdef CONFIG_X86
    struct kvm_pic *pic = pic_irqchip(kvm);

    level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
                               irq_source_id, level);
    return kvm_pic_set_irq(pic, e->irqchip.pin, level);
#else
    return -1;
#endif
}
这里8259中断控制器,如用户态过程类似,可参考上面分析
int kvm_pic_set_irq(void *opaque, int irq, int level)
{
struct kvm_pic *s =
opaque;
int ret = -1;
pic_lock(s);
if (irq >= 0
&& irq
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq
>> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
}
pic_unlock(s);
return ret;
}
如果收到中断响应
/*
* callback when PIC0 irq status changed
*/
static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm =
opaque;
struct kvm_vcpu *vcpu =
kvm->bsp_vcpu;
struct kvm_pic *s =
pic_irqchip(kvm);
int irq =
pic_get_irq(&s->pics[0]);
s->output =
level;
if (vcpu &&
level && (s->pics[0].isr_ack & (1 <
}
}
中断注入
中断注入实际是向客户机CPU注入一个事件,这个事件包括异常和外部中断和NMI。异常我们一般看作为同步,中断被认为异步。
在硬件层面,中断注入实际上就是设置VMCS中的VM-Entry
interruption-information字段。中断注入是在VM运行前完成的,具体如下:
/*
 * Run the guest (excerpt): only the event-injection step that precedes
 * guest entry is shown; the entry itself and the return statement are
 * omitted from this snippet.
 */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu) {
inject_pending_event(vcpu);
}
vcpu_enter_guest函数运行虚拟机,运行虚拟机已省掉。中断注入实际在VM运行前,接下来看看具体如何注入。
/*
 * Inject at most one event into the vcpu before guest entry.
 *
 * Events already marked as injected/pending on the vcpu are set again
 * first (NMI before interrupt). Only when neither exists is a new event
 * picked: a pending NMI if NMIs are currently allowed, otherwise an
 * interrupt reported by kvm_cpu_has_interrupt() if interrupts are
 * currently allowed.
 */
static void inject_pending_event(struct kvm_vcpu *vcpu)
{
/* An NMI flagged as injected is set again and nothing else is done. */
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
return;
}
/* Likewise for an interrupt that is already queued on the vcpu. */
if (vcpu->arch.interrupt.pending) {
kvm_x86_ops->set_irq(vcpu);
return;
}
/* try to inject new event if pending
*/
if
(vcpu->arch.nmi_pending) {
if (kvm_x86_ops->nmi_allowed(vcpu)) {
/* Move the NMI from "pending" to "injected" before setting it. */
vcpu->arch.nmi_pending = false;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
}
} else if
(kvm_cpu_has_interrupt(vcpu)) {
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
/* Fetch the vector, queue it as a non-soft interrupt and set it. */
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
kvm_x86_ops->set_irq(vcpu);
}
}
}
首先用户态实现中断控制器,不可屏蔽中断和其他中断注入过程。用户态中断采集在qemu中实现
判断是否有等待注入中断,存在话立即注入
接下来内核态模拟的中断控制器,中断注入过程,不可屏蔽中断和其他中断注入过程。
判断KVM内核态是否有不可屏蔽中断,有并且客户机cpu允许中断话,注入中断到客户机cpu中。
判断KVM内核态是否有中断,有中断并且客户机cpu允许中断话,获取优先级高中断进行排队,注入中断到客户机cpu中。
另外一种情况:如果有中断但是客户机不允许中断,只能等待下一次中断注入。如果到下一次注入时有更高级别的中断发生,则优先注入更高级别的中断,该中断仍需继续等待。
/*
 * check if there is pending interrupt without
 * intack.
 *
 * With a userspace irqchip the answer is the vcpu's queued-interrupt
 * flag. With an in-kernel irqchip the local APIC is consulted first; if
 * it reports -1, the PIC output is returned, provided the APIC accepts
 * PIC interrupts.
 */
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
    struct kvm_pic *s;

    if (!irqchip_in_kernel(v->kvm))
        return v->arch.interrupt.pending;

    /* LAPIC */
    if (kvm_apic_has_interrupt(v) != -1)
        return 1;

    if (!kvm_apic_accept_pic_intr(v))
        return 0;

    /* PIC */
    s = pic_irqchip(v->kvm);
    return s->output;
}
/*
 * Take the interrupt reported by kvm_apic_has_interrupt() from the local
 * APIC.
 *
 * When a vector is available it is set in the ISR, the processor
 * priority is recomputed, and the vector is cleared from the IRR.
 *
 * Returns the vector, or -1 when there is none.
 */
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
    int vector = kvm_apic_has_interrupt(vcpu);
    struct kvm_lapic *apic = vcpu->arch.apic;

    if (vector != -1) {
        apic_set_vector(vector, apic->regs + APIC_ISR);
        apic_update_ppr(apic);
        apic_clear_irr(vector, apic);
    }

    return vector;
}
找到中断向量,设置ISR,清除中断请求寄存器。
/*
 * Recompute the processor-priority register (APIC_PROCPRI).
 *
 * Compares the upper nibble of the task-priority register with the upper
 * nibble of the highest in-service vector (0 when nothing is in service):
 * if the TPR nibble is greater or equal, PPR becomes the low byte of the
 * TPR; otherwise PPR becomes the upper nibble of the in-service vector.
 */
static void apic_update_ppr(struct kvm_lapic *apic)
{
    u32 tpr, isrv, ppr;
    int isr;

    tpr = apic_get_reg(apic, APIC_TASKPRI);
    isr = apic_find_highest_isr(apic);
    isrv = (isr != -1) ? isr : 0;

    if ((tpr & 0xf0) >= (isrv & 0xf0))
        ppr = tpr & 0xff;
    else
        ppr = isrv & 0xf0;

    apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
               apic, ppr, isr, isrv);

    apic_set_reg(apic, APIC_PROCPRI, ppr);
}
获取tpr寄存器内容,并查询当前正在服务(ISR中)的最高优先级向量。TPR 的优先级字段可取 0~15 共 16 个值,对应 16 个 CPU
规定的中断优先级级别,值越大优
先级越高。CPU 只处理比 TPR 中值优先级别更高的中断。随后比较 TPR 与 ISR 中最高向量的优先级类别,取较大者写入 PPR;PPR 决定 Pending 在 IRR
上的中断是否发送给 CPU。
TPR,task priority register,任务优先级寄存器,它确定当前 CPU 可处理什么优先级别
范围内的中断。CPU 只处理比 TPR
中值优先级别更高的中断.
PPR,Processor priority register,处理器优先级寄存器。该寄存器决定当前 CPU 正在处
理的中断的优先级级别,以确定一个 Pending 在 IRR
上的中断是否发送给 CPU。与 TPR 不
同,它的值由 CPU 写而不是软件写。
/*
 * Report whether an external interrupt may be injected right now.
 *
 * Returns 1 only when the guest RFLAGS has IF set and neither the STI
 * nor the MOV-SS bit is set in the guest interruptibility state; returns
 * 0 otherwise. Both values are read from the VMCS.
 */
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
    if (!(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
        return 0;

    if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))
        return 0;

    return 1;
}
判断客户机中断标志寄存器和判断中断能力信息
/*
 * Queue an interrupt on the vcpu: mark it pending and record its vector
 * and whether it is a software interrupt.
 */
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
                                       bool soft)
{
    vcpu->arch.interrupt.pending = true;
    vcpu->arch.interrupt.soft    = soft;
    vcpu->arch.interrupt.nr      = vector;
}
/*
 * Write the vcpu's queued interrupt into the VMCS.
 *
 * The vector is combined with the valid bit and an interrupt type
 * (software or external) and stored in VM_ENTRY_INTR_INFO_FIELD. For a
 * software interrupt the saved instruction length is written to
 * VM_ENTRY_INSTRUCTION_LEN as well. The injection is also traced and
 * counted in the vcpu statistics.
 */
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
    struct vcpu_vmx *vmx = to_vmx(vcpu);
    uint32_t intr;
    int irq = vcpu->arch.interrupt.nr;

    trace_kvm_inj_virq(irq);
    ++vcpu->stat.irq_injections;

    intr = irq | INTR_INFO_VALID_MASK;
    if (!vcpu->arch.interrupt.soft) {
        intr |= INTR_TYPE_EXT_INTR;
    } else {
        intr |= INTR_TYPE_SOFT_INTR;
        vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                     vmx->vcpu.arch.event_exit_inst_len);
    }

    vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
}
由结构体成员vcpu,获取包含该成员的结构体vmx,这个转换是由container_of(ptr, type, member)
实现的,有兴趣可以自己分析一下。
接下来设置VM-Entry interruption-information字段,字段格式如下:
0-7位为中断向量
8-10位为中断类型
11位表示是否提交错误码
12-30位为保留
31位为有效位
设置中断信息字段的中断向量,并将中断信息字段最高位(31)为置1,1表明中断有效。
根据中断向量类型为软中断或者硬件中断,设置中断信息字段。
最后把写入中断信息字段到VMCS的数据域,从而完成中断注入。
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int
level)
{
u32 old_irr;
u32 mask = 1 <<
irq;
union
kvm_ioapic_redirect_entry entry;
int ret = 1;
spin_lock(&ioapic->lock);
old_irr =
ioapic->irr;
if (irq >= 0
&& irq
level ^= entry.fields.polarity;
if (!level)
ioapic->irr &= ~mask;
else {
int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
ioapic->irr |= mask;
if ((edge && old_irr != ioapic->irr) ||
(!edge && !entry.fields.remote_irr))
ret = ioapic_service(ioapic, irq);
else
ret = 0; /* report coalesced interrupt */
}
trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
}
spin_unlock(&ioapic->lock);
return ret;
}
注意:中断请求寄存器表示已接受的中断,但是尚未提交
获取中断请求寄存器内容。
判断irq引脚线是否小于24
获取相应引脚重定向表内容,触发电平异或中断管脚的极性,主要因为entry.fields.polarity为0表示高电平有效,level为1将表示产生中断
获取触发模式,设置中断请求寄存器。如果为边沿触发并且中断请求寄存器发生了变化(该引脚之前没有挂起的请求),或者为电平触发并且remote_irr为0,则调用ioapic_service投递中断;否则返回0,表示该中断被合并。当中断是
level 触发时,LAPIC 接收了该中断,remote_irr内容为1,LAPIC 写 EOI
时,remote_irr内容为0
/*
 * Deliver the interrupt of one IOAPIC pin unless the pin is masked.
 *
 * Returns -1 for a masked pin, otherwise the result of ioapic_deliver().
 * When that result is non-zero and the pin is level triggered,
 * remote_irr is set in the pin's redirection entry.
 */
static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
{
    union kvm_ioapic_redirect_entry *pent = &ioapic->redirtbl[idx];
    int injected;

    if (pent->fields.mask)
        return -1;

    injected = ioapic_deliver(ioapic, idx);
    if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
        pent->fields.remote_irr = 1;

    return injected;
}
如果中断屏蔽位没有设置,允许中断。
/*
 * Build a local-APIC interrupt message from the redirection entry of the
 * given pin and hand it to kvm_irq_delivery_to_apic().
 *
 * Destination, vector, destination mode and trigger mode are copied from
 * the entry; the delivery mode is shifted left by 8; level is always 1
 * and no shorthand is used. On x86, pin 0 is always redirected to the
 * boot vcpu in physical destination mode.
 *
 * Returns the result of kvm_irq_delivery_to_apic().
 */
static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
{
    union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
    struct kvm_lapic_irq irqe;

    ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
                 "vector=%x trig_mode=%x\n",
                 entry->fields.dest, entry->fields.dest_mode,
                 entry->fields.delivery_mode, entry->fields.vector,
                 entry->fields.trig_mode);

    irqe.dest_id = entry->fields.dest_id;
    irqe.vector = entry->fields.vector;
    irqe.dest_mode = entry->fields.dest_mode;
    irqe.trig_mode = entry->fields.trig_mode;
    irqe.delivery_mode = entry->fields.delivery_mode << 8;
    irqe.level = 1;
    irqe.shorthand = 0;

#ifdef CONFIG_X86
    /* Always deliver the PIT interrupt to vcpu 0 */
    if (irq == 0) {
        irqe.dest_mode = 0; /* Physical mode. */
        /* need to read apic_id from apic register since
         * it can be rewritten */
        irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
    }
#endif
    return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
}
获取中断目的地、中断向量、目的模式、触发模式、投递模式和触发电平。如果是PIT中断(irq 0),则把中断目的地重新设置为vcpu 0(bsp)。最后将中断发往目的地local
apic
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic
*src,
struct kvm_lapic_irq *irq)
{
int i, r = -1;
struct kvm_vcpu *vcpu,
*lowest = NULL;
if (irq->dest_mode
== 0 && irq->dest_id == 0xff &&
kvm_is_dm_lowest_prio(irq))
printk(KERN_INFO "kvm: apic: phys broadcast and lowest
prio\n");
kvm_for_each_vcpu(i,
vcpu, kvm) {
if (!kvm_apic_present(vcpu))
continue;
if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
irq->dest_id, irq->dest_mode))
continue;
if (!kvm_is_dm_lowest_prio(irq)) {
if (r <0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq);
} else if (kvm_lapic_enabled(vcpu)) {
if (!lowest)
lowest = vcpu;
else if (kvm_apic_compare_prio(vcpu, lowest) <0)
lowest = vcpu;
}
}
if (lowest)
r = kvm_apic_set_irq(lowest, irq);
}
如果为Physical Mode,并且dest_id=0xff(目的地为广播),同时投递模式是lowest priority,打印警告信息。
遍历所有vcpu寻找匹配的目的local apic:如果Delivery mode不是lowest priority,直接向每个匹配的vcpu触发中断;如果是 lowest priority,则选出优先级最低的vcpu,这样 IOAPIC
的中断消息由优先级最低的 CPU 接收。
/*
 * Accept an interrupt on a local APIC (excerpt).
 *
 * Only the lowest-priority and fixed delivery modes are shown; the other
 * cases, the end of the switch, and the return statement are not part of
 * this snippet.
 */
static int __apic_accept_irq(struct kvm_lapic *apic, int
delivery_mode,
int vector, int level, int trig_mode)
{
int result = 0;
struct kvm_vcpu *vcpu =
apic->vcpu;
switch
(delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
/* no break: continues into the APIC_DM_FIXED handling below */
case APIC_DM_FIXED:
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
/* Record the trigger mode of this vector in the TMR. */
if (trig_mode) {
apic_debug("level trig mode for vector %d", vector);
apic_set_vector(vector, apic->regs + APIC_TMR);
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
/*
 * result is non-zero when apic_test_and_set_irr() returned zero --
 * presumably meaning the IRR bit was not already set; confirm against
 * that helper.
 */
result = !apic_test_and_set_irr(vector, apic);
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
trig_mode, vector, !result);
/* Nothing new was latched: skip the vcpu kick. */
if (!result) {
if (trig_mode)
apic_debug("level trig mode repeatedly for "
"vector %d", vector);
break;
}
kvm_vcpu_kick(vcpu);
break;
设置TMR(即 Trigger Mode Register),用于表示当前正在处理中断的触发模式,1 为 level,0 为 edge;然后设置中断请求寄存器。
kvm_vcpu_kick 产生处理器间中断(ipi),使vcpu重新调度,为中断注入做准备。
Destination Field,目的字段,R/W(可读写)。根据Destination Filed(见下)值的不同,该字段值的意义不同,它有两个意义: