1 : /*
2 : * linux/arch/i386/mm/fault.c
3 : *
4 : * Copyright (C) 1995 Linus Torvalds
5 : */
6 :
7 : #include <linux/signal.h>
8 : #include <linux/sched.h>
9 : #include <linux/kernel.h>
10 : #include <linux/errno.h>
11 : #include <linux/string.h>
12 : #include <linux/types.h>
13 : #include <linux/ptrace.h>
14 : #include <linux/mman.h>
15 : #include <linux/mm.h>
16 : #include <linux/smp.h>
17 : #include <linux/smp_lock.h>
18 : #include <linux/interrupt.h>
19 : #include <linux/init.h>
20 : #include <linux/tty.h>
21 : #include <linux/vt_kern.h> /* For unblank_screen() */
22 : #include <linux/highmem.h>
23 : #include <linux/module.h>
24 : #include <linux/kprobes.h>
25 :
26 : #include <asm/system.h>
27 : #include <asm/uaccess.h>
28 : #include <asm/desc.h>
29 : #include <asm/kdebug.h>
30 :
31 : extern void die(const char *,struct pt_regs *,long);
32 :
33 : /*
34 : * Unlock any spinlocks which will prevent us from getting the
35 : * message out
36 : */
37 : void bust_spinlocks(int yes)
38 0 : {
39 0 : int loglevel_save = console_loglevel;
40 :
41 0 : if (yes) {
42 0 : oops_in_progress = 1;
43 0 : return;
44 : }
45 : #ifdef CONFIG_VT
46 0 : unblank_screen();
47 : #endif
48 0 : oops_in_progress = 0;
49 : /*
50 : * OK, the message is on the console. Now we call printk()
51 : * without oops_in_progress set so that printk will give klogd
52 : * a poke. Hold onto your hats...
53 : */
54 0 : console_loglevel = 15; /* NMI oopser may have shut the console up */
55 0 : printk(" ");
56 0 : console_loglevel = loglevel_save;
57 : }
58 :
59 : /*
60 : * Return EIP plus the CS segment base. The segment limit is also
61 : * adjusted, clamped to the kernel/user address space (whichever is
62 : * appropriate), and returned in *eip_limit.
63 : *
64 : * The segment is checked, because it might have been changed by another
65 : * task between the original faulting instruction and here.
66 : *
67 : * If CS is no longer a valid code segment, or if EIP is beyond the
68 : * limit, or if it is a kernel address when CS is not a kernel segment,
69 : * then the returned value will be greater than *eip_limit.
70 : *
71 : * This is slow, but is very rarely executed.
72 : */
73 : static inline unsigned long get_segment_eip(struct pt_regs *regs,
74 : unsigned long *eip_limit)
75 0 : {
76 0 : unsigned long eip = regs->eip;
77 0 : unsigned seg = regs->xcs & 0xffff;
78 0 : u32 seg_ar, seg_limit, base, *desc;
79 :
80 : /* The standard kernel/user address space limit. */
81 0 : *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
82 :
83 : /* Unlikely, but must come before segment checks. */
84 0 : if (unlikely((regs->eflags & VM_MASK) != 0))
85 : return eip + (seg << 4);
86 :
87 : /* By far the most common cases. */
88 0 : if (likely(seg == __USER_CS || seg == __KERNEL_CS))
89 : return eip;
90 :
91 : /* Check the segment exists, is within the current LDT/GDT size,
92 : that kernel/user (ring 0..3) has the appropriate privilege,
93 : that it's a code segment, and get the limit. */
94 0 : __asm__ ("larl %3,%0; lsll %3,%1"
95 : : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
96 0 : if ((~seg_ar & 0x9800) || eip > seg_limit) {
97 0 : *eip_limit = 0;
98 : return 1; /* So that returned eip > *eip_limit. */
99 : }
100 :
101 : /* Get the GDT/LDT descriptor base.
102 : When you look for races in this code remember that
103 : LDT and other horrors are only used in user space. */
104 0 : if (seg & (1<<2)) {
105 : /* Must lock the LDT while reading it. */
106 0 : down(¤t->mm->context.sem);
107 0 : desc = current->mm->context.ldt;
108 0 : desc = (void *)desc + (seg & ~7);
109 : } else {
110 : /* Must disable preemption while reading the GDT. */
111 0 : desc = (u32 *)&per_cpu(cpu_gdt_table, get_cpu());
112 0 : desc = (void *)desc + (seg & ~7);
113 : }
114 :
115 : /* Decode the code segment base from the descriptor */
116 0 : base = get_desc_base((unsigned long *)desc);
117 :
118 0 : if (seg & (1<<2)) {
119 0 : up(¤t->mm->context.sem);
120 : } else
121 0 : put_cpu();
122 :
123 : /* Adjust EIP and segment limit, and clamp at the kernel limit.
124 : It's legitimate for segments to wrap at 0xffffffff. */
125 0 : seg_limit += base;
126 0 : if (seg_limit < *eip_limit && seg_limit >= base)
127 0 : *eip_limit = seg_limit;
128 : return eip + base;
129 : }
130 :
131 : /*
132 : * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
133 : * Check that here and ignore it.
134 : */
135 : static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
136 0 : {
137 0 : unsigned long limit;
138 0 : unsigned long instr = get_segment_eip (regs, &limit);
139 0 : int scan_more = 1;
140 0 : int prefetch = 0;
141 0 : int i;
142 :
143 0 : for (i = 0; scan_more && i < 15; i++) {
144 0 : unsigned char opcode;
145 0 : unsigned char instr_hi;
146 0 : unsigned char instr_lo;
147 :
148 0 : if (instr > limit)
149 0 : break;
150 0 : if (__get_user(opcode, (unsigned char __user *) instr))
151 0 : break;
152 :
153 0 : instr_hi = opcode & 0xf0;
154 0 : instr_lo = opcode & 0x0f;
155 0 : instr++;
156 :
157 0 : switch (instr_hi) {
158 : case 0x20:
159 : case 0x30:
160 : /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
161 0 : scan_more = ((instr_lo & 7) == 0x6);
162 0 : break;
163 :
164 : case 0x60:
165 : /* 0x64 thru 0x67 are valid prefixes in all modes. */
166 0 : scan_more = (instr_lo & 0xC) == 0x4;
167 0 : break;
168 : case 0xF0:
169 : /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
170 0 : scan_more = !instr_lo || (instr_lo>>1) == 1;
171 0 : break;
172 : case 0x00:
173 : /* Prefetch instruction is 0x0F0D or 0x0F18 */
174 0 : scan_more = 0;
175 0 : if (instr > limit)
176 0 : break;
177 0 : if (__get_user(opcode, (unsigned char __user *) instr))
178 0 : break;
179 0 : prefetch = (instr_lo == 0xF) &&
180 : (opcode == 0x0D || opcode == 0x18);
181 0 : break;
182 : default:
183 0 : scan_more = 0;
184 0 : break;
185 : }
186 : }
187 0 : return prefetch;
188 : }
189 :
190 : static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
191 : unsigned long error_code)
192 0 : {
193 0 : if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
194 : boot_cpu_data.x86 >= 6)) {
195 : /* Catch an obscure case of prefetch inside an NX page. */
196 0 : if (nx_enabled && (error_code & 16))
197 : return 0;
198 : return __is_prefetch(regs, addr);
199 0 : }
200 : return 0;
201 : }
202 :
203 : static noinline void force_sig_info_fault(int si_signo, int si_code,
204 : unsigned long address, struct task_struct *tsk)
205 0 : {
206 0 : siginfo_t info;
207 :
208 0 : info.si_signo = si_signo;
209 0 : info.si_errno = 0;
210 0 : info.si_code = si_code;
211 0 : info.si_addr = (void __user *)address;
212 0 : force_sig_info(si_signo, &info, tsk);
213 : }
214 :
215 : fastcall void do_invalid_op(struct pt_regs *, unsigned long);
216 :
217 : /*
218 : * This routine handles page faults. It determines the address,
219 : * and the problem, and then passes it off to one of the appropriate
220 : * routines.
221 : *
222 : * error_code:
223 : * bit 0 == 0 means no page found, 1 means protection fault
224 : * bit 1 == 0 means read, 1 means write
225 : * bit 2 == 0 means kernel, 1 means user-mode
226 : */
227 : fastcall void __kprobes do_page_fault(struct pt_regs *regs,
228 : unsigned long error_code)
229 42006 : {
230 42006 : struct task_struct *tsk;
231 42006 : struct mm_struct *mm;
232 42006 : struct vm_area_struct * vma;
233 42006 : unsigned long address;
234 42006 : unsigned long page;
235 42006 : int write, si_code;
236 :
237 : /* get the address */
238 42006 : address = read_cr2();
239 :
240 42006 : if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
241 : SIGSEGV) == NOTIFY_STOP)
242 42006 : return;
243 : /* It's safe to allow irq's after cr2 has been saved */
244 42006 : if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
245 42006 : local_irq_enable();
246 :
247 42006 : tsk = current;
248 :
249 42006 : si_code = SEGV_MAPERR;
250 :
251 : /*
252 : * We fault-in kernel-space virtual memory on-demand. The
253 : * 'reference' page table is init_mm.pgd.
254 : *
255 : * NOTE! We MUST NOT take any locks for this case. We may
256 : * be in an interrupt or a critical region, and should
257 : * only copy the information from the master page table,
258 : * nothing more.
259 : *
260 : * This verifies that the fault happens in kernel space
261 : * (error_code & 4) == 0, and that the fault was not a
262 : * protection error (error_code & 1) == 0.
263 : */
264 42006 : if (unlikely(address >= TASK_SIZE)) {
265 1 : if (!(error_code & 5))
266 0 : goto vmalloc_fault;
267 : /*
268 : * Don't take the mm semaphore here. If we fixup a prefetch
269 : * fault we could otherwise deadlock.
270 : */
271 42005 : goto bad_area_nosemaphore;
272 : }
273 :
274 42005 : mm = tsk->mm;
275 :
276 : /*
277 : * If we're in an interrupt, have no user context or are running in an
278 : * atomic region then we must not take the fault..
279 : */
280 42005 : if (in_atomic() || !mm)
281 42005 : goto bad_area_nosemaphore;
282 :
283 : /* When running in the kernel we expect faults to occur only to
284 : * addresses in user space. All other faults represent errors in the
285 : * kernel and should generate an OOPS. Unfortunatly, in the case of an
286 : * erroneous fault occuring in a code path which already holds mmap_sem
287 : * we will deadlock attempting to validate the fault against the
288 : * address space. Luckily the kernel only validly references user
289 : * space from well defined areas of code, which are listed in the
290 : * exceptions table.
291 : *
292 : * As the vast majority of faults will be valid we will only perform
293 : * the source reference check when there is a possibilty of a deadlock.
294 : * Attempt to lock the address space, if we cannot we then validate the
295 : * source. If this is invalid we can skip the address space check,
296 : * thus avoiding the deadlock.
297 : */
298 42005 : if (!down_read_trylock(&mm->mmap_sem)) {
299 0 : if ((error_code & 4) == 0 &&
300 : !search_exception_tables(regs->eip))
301 0 : goto bad_area_nosemaphore;
302 0 : down_read(&mm->mmap_sem);
303 : }
304 :
305 42005 : vma = find_vma(mm, address);
306 42005 : if (!vma)
307 42005 : goto bad_area;
308 42005 : if (vma->vm_start <= address)
309 3 : goto good_area;
310 3 : if (!(vma->vm_flags & VM_GROWSDOWN))
311 3 : goto bad_area;
312 3 : if (error_code & 4) {
313 : /*
314 : * accessing the stack below %esp is always a bug.
315 : * The "+ 32" is there due to some instructions (like
316 : * pusha) doing post-decrement on the stack and that
317 : * doesn't show up until later..
318 : */
319 3 : if (address + 32 < regs->esp)
320 3 : goto bad_area;
321 : }
322 3 : if (expand_stack(vma, address))
323 42005 : goto bad_area;
324 : /*
325 : * Ok, we have a good vm_area for this memory access, so
326 : * we can handle it..
327 : */
328 : good_area:
329 42005 : si_code = SEGV_ACCERR;
330 42005 : write = 0;
331 42005 : switch (error_code & 3) {
332 : default: /* 3: write, present */
333 : #ifdef TEST_VERIFY_AREA
334 : if (regs->cs == KERNEL_CS)
335 : printk("WP fault at %08lx\n", regs->eip);
336 : #endif
337 : /* fall through */
338 : case 2: /* write, not present */
339 162 : if (!(vma->vm_flags & VM_WRITE))
340 162 : goto bad_area;
341 162 : write++;
342 162 : break;
343 : case 1: /* read, present */
344 41843 : goto bad_area;
345 : case 0: /* read, not present */
346 41843 : if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
347 42005 : goto bad_area;
348 : }
349 :
350 : survive:
351 : /*
352 : * If for any reason at all we couldn't handle the fault,
353 : * make sure we exit gracefully rather than endlessly redo
354 : * the fault.
355 : */
356 42005 : switch (handle_mm_fault(mm, vma, address, write)) {
357 : case VM_FAULT_MINOR:
358 41778 : tsk->min_flt++;
359 41778 : break;
360 : case VM_FAULT_MAJOR:
361 227 : tsk->maj_flt++;
362 227 : break;
363 : case VM_FAULT_SIGBUS:
364 0 : goto do_sigbus;
365 : case VM_FAULT_OOM:
366 0 : goto out_of_memory;
367 : default:
368 0 : BUG();
369 : }
370 :
371 : /*
372 : * Did it hit the DOS screen memory VA from vm86 mode?
373 : */
374 42005 : if (regs->eflags & VM_MASK) {
375 0 : unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
376 0 : if (bit < 32)
377 0 : tsk->thread.screen_bitmap |= 1 << bit;
378 : }
379 42005 : up_read(&mm->mmap_sem);
380 42005 : return;
381 :
382 : /*
383 : * Something tried to access memory that isn't in our memory map..
384 : * Fix it, but check if it's kernel or user first..
385 : */
386 : bad_area:
387 0 : up_read(&mm->mmap_sem);
388 :
389 : bad_area_nosemaphore:
390 : /* User mode accesses just cause a SIGSEGV */
391 1 : if (error_code & 4) {
392 : /*
393 : * Valid to do another page fault here because this one came
394 : * from user space.
395 : */
396 0 : if (is_prefetch(regs, address, error_code))
397 0 : return;
398 :
399 0 : tsk->thread.cr2 = address;
400 : /* Kernel addresses are always protection faults */
401 0 : tsk->thread.error_code = error_code | (address >= TASK_SIZE);
402 0 : tsk->thread.trap_no = 14;
403 0 : force_sig_info_fault(SIGSEGV, si_code, address, tsk);
404 0 : return;
405 : }
406 :
407 : #ifdef CONFIG_X86_F00F_BUG
408 : /*
409 : * Pentium F0 0F C7 C8 bug workaround.
410 : */
411 : if (boot_cpu_data.f00f_bug) {
412 : unsigned long nr;
413 :
414 : nr = (address - idt_descr.address) >> 3;
415 :
416 : if (nr == 6) {
417 : do_invalid_op(regs, 0);
418 : return;
419 : }
420 : }
421 : #endif
422 :
423 : no_context:
424 : /* Are we prepared to handle this kernel fault? */
425 1 : if (fixup_exception(regs))
426 0 : return;
427 :
428 : /*
429 : * Valid to do another page fault here, because if this fault
430 : * had been triggered by is_prefetch fixup_exception would have
431 : * handled it.
432 : */
433 0 : if (is_prefetch(regs, address, error_code))
434 0 : return;
435 :
436 : /*
437 : * Oops. The kernel tried to access some bad page. We'll have to
438 : * terminate things with extreme prejudice.
439 : */
440 :
441 0 : bust_spinlocks(1);
442 :
443 : #ifdef CONFIG_X86_PAE
444 : if (error_code & 16) {
445 : pte_t *pte = lookup_address(address);
446 :
447 : if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
448 : printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
449 : }
450 : #endif
451 0 : if (address < PAGE_SIZE)
452 0 : printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
453 : else
454 0 : printk(KERN_ALERT "Unable to handle kernel paging request");
455 0 : printk(" at virtual address %08lx\n",address);
456 0 : printk(KERN_ALERT " printing eip:\n");
457 0 : printk("%08lx\n", regs->eip);
458 0 : page = read_cr3();
459 0 : page = ((unsigned long *) __va(page))[address >> 22];
460 0 : printk(KERN_ALERT "*pde = %08lx\n", page);
461 : /*
462 : * We must not directly access the pte in the highpte
463 : * case, the page table might be allocated in highmem.
464 : * And lets rather not kmap-atomic the pte, just in case
465 : * it's allocated already.
466 : */
467 : #ifndef CONFIG_HIGHPTE
468 0 : if (page & 1) {
469 0 : page &= PAGE_MASK;
470 0 : address &= 0x003ff000;
471 0 : page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
472 0 : printk(KERN_ALERT "*pte = %08lx\n", page);
473 : }
474 : #endif
475 0 : tsk->thread.cr2 = address;
476 0 : tsk->thread.trap_no = 14;
477 0 : tsk->thread.error_code = error_code;
478 0 : die("Oops", regs, error_code);
479 0 : bust_spinlocks(0);
480 0 : do_exit(SIGKILL);
481 :
482 : /*
483 : * We ran out of memory, or some other thing happened to us that made
484 : * us unable to handle the page fault gracefully.
485 : */
486 : out_of_memory:
487 0 : up_read(&mm->mmap_sem);
488 0 : if (tsk->pid == 1) {
489 0 : yield();
490 0 : down_read(&mm->mmap_sem);
491 0 : goto survive;
492 : }
493 0 : printk("VM: killing process %s\n", tsk->comm);
494 0 : if (error_code & 4)
495 0 : do_exit(SIGKILL);
496 0 : goto no_context;
497 :
498 : do_sigbus:
499 0 : up_read(&mm->mmap_sem);
500 :
501 : /* Kernel mode? Handle exceptions or die */
502 0 : if (!(error_code & 4))
503 0 : goto no_context;
504 :
505 : /* User space => ok to do another page fault */
506 0 : if (is_prefetch(regs, address, error_code))
507 0 : return;
508 :
509 0 : tsk->thread.cr2 = address;
510 0 : tsk->thread.error_code = error_code;
511 0 : tsk->thread.trap_no = 14;
512 0 : force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
513 0 : return;
514 :
515 : vmalloc_fault:
516 : {
517 : /*
518 : * Synchronize this task's top level page-table
519 : * with the 'reference' page table.
520 : *
521 : * Do _not_ use "tsk" here. We might be inside
522 : * an interrupt in the middle of a task switch..
523 : */
524 0 : int index = pgd_index(address);
525 0 : unsigned long pgd_paddr;
526 0 : pgd_t *pgd, *pgd_k;
527 0 : pud_t *pud, *pud_k;
528 0 : pmd_t *pmd, *pmd_k;
529 0 : pte_t *pte_k;
530 :
531 0 : pgd_paddr = read_cr3();
532 0 : pgd = index + (pgd_t *)__va(pgd_paddr);
533 0 : pgd_k = init_mm.pgd + index;
534 :
535 0 : if (!pgd_present(*pgd_k))
536 0 : goto no_context;
537 :
538 : /*
539 : * set_pgd(pgd, *pgd_k); here would be useless on PAE
540 : * and redundant with the set_pmd() on non-PAE. As would
541 : * set_pud.
542 : */
543 :
544 0 : pud = pud_offset(pgd, address);
545 0 : pud_k = pud_offset(pgd_k, address);
546 0 : if (!pud_present(*pud_k))
547 0 : goto no_context;
548 :
549 0 : pmd = pmd_offset(pud, address);
550 0 : pmd_k = pmd_offset(pud_k, address);
551 0 : if (!pmd_present(*pmd_k))
552 0 : goto no_context;
553 0 : set_pmd(pmd, *pmd_k);
554 :
555 0 : pte_k = pte_offset_kernel(pmd_k, address);
556 0 : if (!pte_present(*pte_k))
557 42006 : goto no_context;
558 42006 : return;
559 : }
560 : }
|