diff --git a/Makefile b/Makefile index 44960184701a..1523557bd61f 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 20 +SUBLEVEL = 21 EXTRAVERSION = NAME = Roaring Lionus diff --git a/arch/arm/boot/dts/bcm5301x.dtsi b/arch/arm/boot/dts/bcm5301x.dtsi index ae4b3880616d..4616452ce74d 100644 --- a/arch/arm/boot/dts/bcm5301x.dtsi +++ b/arch/arm/boot/dts/bcm5301x.dtsi @@ -66,14 +66,14 @@ timer@20200 { compatible = "arm,cortex-a9-global-timer"; reg = <0x20200 0x100>; - interrupts = ; + interrupts = ; clocks = <&periph_clk>; }; local-timer@20600 { compatible = "arm,cortex-a9-twd-timer"; reg = <0x20600 0x100>; - interrupts = ; + interrupts = ; clocks = <&periph_clk>; }; diff --git a/arch/arm/mach-bcm/bcm_5301x.c b/arch/arm/mach-bcm/bcm_5301x.c index c8830a2b0d60..fe067f6cebb6 100644 --- a/arch/arm/mach-bcm/bcm_5301x.c +++ b/arch/arm/mach-bcm/bcm_5301x.c @@ -9,14 +9,42 @@ #include #include +#include +#include + +#define FSR_EXTERNAL (1 << 12) +#define FSR_READ (0 << 10) +#define FSR_IMPRECISE 0x0406 static const char *const bcm5301x_dt_compat[] __initconst = { "brcm,bcm4708", NULL, }; +static int bcm5301x_abort_handler(unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +{ + /* + * We want to ignore aborts forwarded from the PCIe bus that are + * expected and shouldn't really be passed by the PCIe controller. + * The biggest disadvantage is the same FSR code may be reported when + * reading non-existing APB register and we shouldn't ignore that. + */ + if (fsr == (FSR_EXTERNAL | FSR_READ | FSR_IMPRECISE)) + return 0; + + return 1; +} + +static void __init bcm5301x_init_early(void) +{ + hook_fault_code(16 + 6, bcm5301x_abort_handler, SIGBUS, BUS_OBJERR, + "imprecise external abort"); +} + DT_MACHINE_START(BCM5301X, "BCM5301X") .l2c_aux_val = 0, .l2c_aux_mask = ~0, .dt_compat = bcm5301x_dt_compat, + .init_early = bcm5301x_init_early, MACHINE_END diff --git a/arch/mips/lantiq/irq.c b/arch/mips/lantiq/irq.c index 8ac0e5994ed2..0ddf3698b85d 100644 --- a/arch/mips/lantiq/irq.c +++ b/arch/mips/lantiq/irq.c @@ -269,6 +269,11 @@ static void ltq_hw5_irqdispatch(void) DEFINE_HWx_IRQDISPATCH(5) #endif +static void ltq_hw_irq_handler(struct irq_desc *desc) +{ + ltq_hw_irqdispatch(irq_desc_get_irq(desc) - 2); +} + #ifdef CONFIG_MIPS_MT_SMP void __init arch_init_ipiirq(int irq, struct irqaction *action) { @@ -313,23 +318,19 @@ static struct irqaction irq_call = { asmlinkage void plat_irq_dispatch(void) { unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; - unsigned int i; - - if ((MIPS_CPU_TIMER_IRQ == 7) && (pending & CAUSEF_IP7)) { - do_IRQ(MIPS_CPU_TIMER_IRQ); - goto out; - } else { - for (i = 0; i < MAX_IM; i++) { - if (pending & (CAUSEF_IP2 << i)) { - ltq_hw_irqdispatch(i); - goto out; - } - } + int irq; + + if (!pending) { + spurious_interrupt(); + return; } - pr_alert("Spurious IRQ: CAUSE=0x%08x\n", read_c0_status()); -out: - return; + pending >>= CAUSEB_IP; + while (pending) { + irq = fls(pending) - 1; + do_IRQ(MIPS_CPU_IRQ_BASE + irq); + pending &= ~BIT(irq); + } } static int icu_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) @@ -354,11 +355,6 @@ static const struct irq_domain_ops irq_domain_ops = { .map = icu_map, }; -static struct irqaction cascade = { - .handler = no_action, - .name = "cascade", -}; - int __init icu_of_init(struct device_node *node, struct device_node *parent) { struct device_node *eiu_node; @@ -390,7 +386,7 @@ int __init icu_of_init(struct device_node *node, struct device_node *parent) mips_cpu_irq_init(); for (i = 0; i < MAX_IM; i++) - setup_irq(i + 2, &cascade); + irq_set_chained_handler(i + 2, ltq_hw_irq_handler); if (cpu_has_vint) { pr_info("Setting up vectored interrupts\n"); diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 9a2aee1b90fc..7fcf5128996a 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -68,6 +68,15 @@ struct exception_table_entry { ".previous\n" /* + * ASM_EXCEPTIONTABLE_ENTRY_EFAULT() creates a special exception table entry + * (with lowest bit set) for which the fault handler in fixup_exception() will + * load -EFAULT into %r8 for a read or write fault, and zeroes the target + * register in case of a read fault in get_user(). + */ +#define ASM_EXCEPTIONTABLE_ENTRY_EFAULT( fault_addr, except_addr )\ + ASM_EXCEPTIONTABLE_ENTRY( fault_addr, except_addr + 1) + +/* * The page fault handler stores, in a per-cpu area, the following information * if a fixup routine is available. */ @@ -94,7 +103,7 @@ struct exception_data { #define __get_user(x, ptr) \ ({ \ register long __gu_err __asm__ ("r8") = 0; \ - register long __gu_val __asm__ ("r9") = 0; \ + register long __gu_val; \ \ load_sr2(); \ switch (sizeof(*(ptr))) { \ @@ -110,22 +119,23 @@ struct exception_data { }) #define __get_user_asm(ldx, ptr) \ - __asm__("\n1:\t" ldx "\t0(%%sr2,%2),%0\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_1)\ + __asm__("1: " ldx " 0(%%sr2,%2),%0\n" \ + "9:\n" \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ : "=r"(__gu_val), "=r"(__gu_err) \ - : "r"(ptr), "1"(__gu_err) \ - : "r1"); + : "r"(ptr), "1"(__gu_err)); #if !defined(CONFIG_64BIT) #define __get_user_asm64(ptr) \ - __asm__("\n1:\tldw 0(%%sr2,%2),%0" \ - "\n2:\tldw 4(%%sr2,%2),%R0\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_2)\ - ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_get_user_skip_1)\ + __asm__(" copy %%r0,%R0\n" \ + "1: ldw 0(%%sr2,%2),%0\n" \ + "2: ldw 4(%%sr2,%2),%R0\n" \ + "9:\n" \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \ : "=r"(__gu_val), "=r"(__gu_err) \ - : "r"(ptr), "1"(__gu_err) \ - : "r1"); + : "r"(ptr), "1"(__gu_err)); #endif /* !defined(CONFIG_64BIT) */ @@ -151,32 +161,31 @@ struct exception_data { * The "__put_user/kernel_asm()" macros tell gcc they read from memory * instead of writing. This is because they do not write to any memory * gcc knows about, so there are no aliasing issues. These macros must - * also be aware that "fixup_put_user_skip_[12]" are executed in the - * context of the fault, and any registers used there must be listed - * as clobbers. In this case only "r1" is used by the current routines. - * r8/r9 are already listed as err/val. + * also be aware that fixups are executed in the context of the fault, + * and any registers used there must be listed as clobbers. + * r8 is already listed as err. */ #define __put_user_asm(stx, x, ptr) \ __asm__ __volatile__ ( \ - "\n1:\t" stx "\t%2,0(%%sr2,%1)\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_1)\ + "1: " stx " %2,0(%%sr2,%1)\n" \ + "9:\n" \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ : "=r"(__pu_err) \ - : "r"(ptr), "r"(x), "0"(__pu_err) \ - : "r1") + : "r"(ptr), "r"(x), "0"(__pu_err)) #if !defined(CONFIG_64BIT) #define __put_user_asm64(__val, ptr) do { \ __asm__ __volatile__ ( \ - "\n1:\tstw %2,0(%%sr2,%1)" \ - "\n2:\tstw %R2,4(%%sr2,%1)\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_2)\ - ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_put_user_skip_1)\ + "1: stw %2,0(%%sr2,%1)\n" \ + "2: stw %R2,4(%%sr2,%1)\n" \ + "9:\n" \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \ + ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \ : "=r"(__pu_err) \ - : "r"(ptr), "r"(__val), "0"(__pu_err) \ - : "r1"); \ + : "r"(ptr), "r"(__val), "0"(__pu_err)); \ } while (0) #endif /* !defined(CONFIG_64BIT) */ diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index 3cad8aadc69e..4e6f0d93154f 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -47,16 +47,6 @@ EXPORT_SYMBOL(__cmpxchg_u64); EXPORT_SYMBOL(lclear_user); EXPORT_SYMBOL(lstrnlen_user); -/* Global fixups - defined as int to avoid creation of function pointers */ -extern int fixup_get_user_skip_1; -extern int fixup_get_user_skip_2; -extern int fixup_put_user_skip_1; -extern int fixup_put_user_skip_2; -EXPORT_SYMBOL(fixup_get_user_skip_1); -EXPORT_SYMBOL(fixup_get_user_skip_2); -EXPORT_SYMBOL(fixup_put_user_skip_1); -EXPORT_SYMBOL(fixup_put_user_skip_2); - #ifndef CONFIG_64BIT /* Needed so insmod can set dp value */ extern int $global$; diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index e81afc378850..e7ffde2758fc 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -140,6 +140,8 @@ void machine_power_off(void) printk(KERN_EMERG "System shut down completed.\n" "Please power this system off now."); + /* prevent soft lockup/stalled CPU messages for endless loop. */ + rcu_sysrq_start(); for (;;); } diff --git a/arch/parisc/lib/Makefile b/arch/parisc/lib/Makefile index 8fa92b8d839a..f2dac4d73b1b 100644 --- a/arch/parisc/lib/Makefile +++ b/arch/parisc/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for parisc-specific library files # -lib-y := lusercopy.o bitops.o checksum.o io.o memset.o fixup.o memcpy.o \ +lib-y := lusercopy.o bitops.o checksum.o io.o memset.o memcpy.o \ ucmpdi2.o delay.o obj-y := iomap.o diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S deleted file mode 100644 index a5b72f22c7a6..000000000000 --- a/arch/parisc/lib/fixup.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Linux/PA-RISC Project (http://www.parisc-linux.org/) - * - * Copyright (C) 2004 Randolph Chung - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Fixup routines for kernel exception handling. - */ -#include -#include -#include -#include - -#ifdef CONFIG_SMP - .macro get_fault_ip t1 t2 - loadgp - addil LT%__per_cpu_offset,%r27 - LDREG RT%__per_cpu_offset(%r1),\t1 - /* t2 = smp_processor_id() */ - mfctl 30,\t2 - ldw TI_CPU(\t2),\t2 -#ifdef CONFIG_64BIT - extrd,u \t2,63,32,\t2 -#endif - /* t2 = &__per_cpu_offset[smp_processor_id()]; */ - LDREGX \t2(\t1),\t2 - addil LT%exception_data,%r27 - LDREG RT%exception_data(%r1),\t1 - /* t1 = this_cpu_ptr(&exception_data) */ - add,l \t1,\t2,\t1 - /* %r27 = t1->fault_gp - restore gp */ - LDREG EXCDATA_GP(\t1), %r27 - /* t1 = t1->fault_ip */ - LDREG EXCDATA_IP(\t1), \t1 - .endm -#else - .macro get_fault_ip t1 t2 - loadgp - /* t1 = this_cpu_ptr(&exception_data) */ - addil LT%exception_data,%r27 - LDREG RT%exception_data(%r1),\t2 - /* %r27 = t2->fault_gp - restore gp */ - LDREG EXCDATA_GP(\t2), %r27 - /* t1 = t2->fault_ip */ - LDREG EXCDATA_IP(\t2), \t1 - .endm -#endif - - .level LEVEL - - .text - .section .fixup, "ax" - - /* get_user() fixups, store -EFAULT in r8, and 0 in r9 */ -ENTRY_CFI(fixup_get_user_skip_1) - get_fault_ip %r1,%r8 - ldo 4(%r1), %r1 - ldi -EFAULT, %r8 - bv %r0(%r1) - copy %r0, %r9 -ENDPROC_CFI(fixup_get_user_skip_1) - -ENTRY_CFI(fixup_get_user_skip_2) - get_fault_ip %r1,%r8 - ldo 8(%r1), %r1 - ldi -EFAULT, %r8 - bv %r0(%r1) - copy %r0, %r9 -ENDPROC_CFI(fixup_get_user_skip_2) - - /* put_user() fixups, store -EFAULT in r8 */ -ENTRY_CFI(fixup_put_user_skip_1) - get_fault_ip %r1,%r8 - ldo 4(%r1), %r1 - bv %r0(%r1) - ldi -EFAULT, %r8 -ENDPROC_CFI(fixup_put_user_skip_1) - -ENTRY_CFI(fixup_put_user_skip_2) - get_fault_ip %r1,%r8 - ldo 8(%r1), %r1 - bv %r0(%r1) - ldi -EFAULT, %r8 -ENDPROC_CFI(fixup_put_user_skip_2) - diff --git a/arch/parisc/lib/lusercopy.S b/arch/parisc/lib/lusercopy.S index 56845de6b5df..f01188c044ee 100644 --- a/arch/parisc/lib/lusercopy.S +++ b/arch/parisc/lib/lusercopy.S @@ -5,6 +5,8 @@ * Copyright (C) 2000 Richard Hirst * Copyright (C) 2001 Matthieu Delahaye * Copyright (C) 2003 Randolph Chung + * Copyright (C) 2017 Helge Deller + * Copyright (C) 2017 John David Anglin * * * This program is free software; you can redistribute it and/or modify @@ -132,4 +134,320 @@ ENDPROC_CFI(lstrnlen_user) .procend + + +/* + * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) + * + * Inputs: + * - sr1 already contains space of source region + * - sr2 already contains space of destination region + * + * Returns: + * - number of bytes that could not be copied. + * On success, this will be zero. + * + * This code is based on a C-implementation of a copy routine written by + * Randolph Chung, which in turn was derived from the glibc. + * + * Several strategies are tried to try to get the best performance for various + * conditions. In the optimal case, we copy by loops that copy 32- or 16-bytes + * at a time using general registers. Unaligned copies are handled either by + * aligning the destination and then using shift-and-write method, or in a few + * cases by falling back to a byte-at-a-time copy. + * + * Testing with various alignments and buffer sizes shows that this code is + * often >10x faster than a simple byte-at-a-time copy, even for strangely + * aligned operands. It is interesting to note that the glibc version of memcpy + * (written in C) is actually quite fast already. This routine is able to beat + * it by 30-40% for aligned copies because of the loop unrolling, but in some + * cases the glibc version is still slightly faster. This lends more + * credibility that gcc can generate very good code as long as we are careful. + * + * Possible optimizations: + * - add cache prefetching + * - try not to use the post-increment address modifiers; they may create + * additional interlocks. Assumption is that those were only efficient on old + * machines (pre PA8000 processors) + */ + + dst = arg0 + src = arg1 + len = arg2 + end = arg3 + t1 = r19 + t2 = r20 + t3 = r21 + t4 = r22 + srcspc = sr1 + dstspc = sr2 + + t0 = r1 + a1 = t1 + a2 = t2 + a3 = t3 + a0 = t4 + + save_src = ret0 + save_dst = ret1 + save_len = r31 + +ENTRY_CFI(pa_memcpy) + .proc + .callinfo NO_CALLS + .entry + + /* Last destination address */ + add dst,len,end + + /* short copy with less than 16 bytes? */ + cmpib,>>=,n 15,len,.Lbyte_loop + + /* same alignment? */ + xor src,dst,t0 + extru t0,31,2,t1 + cmpib,<>,n 0,t1,.Lunaligned_copy + +#ifdef CONFIG_64BIT + /* only do 64-bit copies if we can get aligned. */ + extru t0,31,3,t1 + cmpib,<>,n 0,t1,.Lalign_loop32 + + /* loop until we are 64-bit aligned */ +.Lalign_loop64: + extru dst,31,3,t1 + cmpib,=,n 0,t1,.Lcopy_loop_16 +20: ldb,ma 1(srcspc,src),t1 +21: stb,ma t1,1(dstspc,dst) + b .Lalign_loop64 + ldo -1(len),len + + ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done) + + ldi 31,t0 +.Lcopy_loop_16: + cmpb,COND(>>=),n t0,len,.Lword_loop + +10: ldd 0(srcspc,src),t1 +11: ldd 8(srcspc,src),t2 + ldo 16(src),src +12: std,ma t1,8(dstspc,dst) +13: std,ma t2,8(dstspc,dst) +14: ldd 0(srcspc,src),t1 +15: ldd 8(srcspc,src),t2 + ldo 16(src),src +16: std,ma t1,8(dstspc,dst) +17: std,ma t2,8(dstspc,dst) + + ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault) + ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault) + ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done) + + b .Lcopy_loop_16 + ldo -32(len),len + +.Lword_loop: + cmpib,COND(>>=),n 3,len,.Lbyte_loop +20: ldw,ma 4(srcspc,src),t1 +21: stw,ma t1,4(dstspc,dst) + b .Lword_loop + ldo -4(len),len + + ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done) + +#endif /* CONFIG_64BIT */ + + /* loop until we are 32-bit aligned */ +.Lalign_loop32: + extru dst,31,2,t1 + cmpib,=,n 0,t1,.Lcopy_loop_4 +20: ldb,ma 1(srcspc,src),t1 +21: stb,ma t1,1(dstspc,dst) + b .Lalign_loop32 + ldo -1(len),len + + ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done) + + +.Lcopy_loop_4: + cmpib,COND(>>=),n 15,len,.Lbyte_loop + +10: ldw 0(srcspc,src),t1 +11: ldw 4(srcspc,src),t2 +12: stw,ma t1,4(dstspc,dst) +13: stw,ma t2,4(dstspc,dst) +14: ldw 8(srcspc,src),t1 +15: ldw 12(srcspc,src),t2 + ldo 16(src),src +16: stw,ma t1,4(dstspc,dst) +17: stw,ma t2,4(dstspc,dst) + + ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault) + ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault) + ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done) + + b .Lcopy_loop_4 + ldo -16(len),len + +.Lbyte_loop: + cmpclr,COND(<>) len,%r0,%r0 + b,n .Lcopy_done +20: ldb 0(srcspc,src),t1 + ldo 1(src),src +21: stb,ma t1,1(dstspc,dst) + b .Lbyte_loop + ldo -1(len),len + + ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done) + +.Lcopy_done: + bv %r0(%r2) + sub end,dst,ret0 + + + /* src and dst are not aligned the same way. */ + /* need to go the hard way */ +.Lunaligned_copy: + /* align until dst is 32bit-word-aligned */ + extru dst,31,2,t1 + cmpib,COND(=),n 0,t1,.Lcopy_dstaligned +20: ldb 0(srcspc,src),t1 + ldo 1(src),src +21: stb,ma t1,1(dstspc,dst) + b .Lunaligned_copy + ldo -1(len),len + + ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done) + ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done) + +.Lcopy_dstaligned: + + /* store src, dst and len in safe place */ + copy src,save_src + copy dst,save_dst + copy len,save_len + + /* len now needs give number of words to copy */ + SHRREG len,2,len + + /* + * Copy from a not-aligned src to an aligned dst using shifts. + * Handles 4 words per loop. + */ + + depw,z src,28,2,t0 + subi 32,t0,t0 + mtsar t0 + extru len,31,2,t0 + cmpib,= 2,t0,.Lcase2 + /* Make src aligned by rounding it down. */ + depi 0,31,2,src + + cmpiclr,<> 3,t0,%r0 + b,n .Lcase3 + cmpiclr,<> 1,t0,%r0 + b,n .Lcase1 +.Lcase0: + cmpb,= %r0,len,.Lcda_finish + nop + +1: ldw,ma 4(srcspc,src), a3 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) +1: ldw,ma 4(srcspc,src), a0 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + b,n .Ldo3 +.Lcase1: +1: ldw,ma 4(srcspc,src), a2 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) +1: ldw,ma 4(srcspc,src), a3 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + ldo -1(len),len + cmpb,=,n %r0,len,.Ldo0 +.Ldo4: +1: ldw,ma 4(srcspc,src), a0 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + shrpw a2, a3, %sar, t0 +1: stw,ma t0, 4(dstspc,dst) + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done) +.Ldo3: +1: ldw,ma 4(srcspc,src), a1 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + shrpw a3, a0, %sar, t0 +1: stw,ma t0, 4(dstspc,dst) + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done) +.Ldo2: +1: ldw,ma 4(srcspc,src), a2 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + shrpw a0, a1, %sar, t0 +1: stw,ma t0, 4(dstspc,dst) + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done) +.Ldo1: +1: ldw,ma 4(srcspc,src), a3 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + shrpw a1, a2, %sar, t0 +1: stw,ma t0, 4(dstspc,dst) + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done) + ldo -4(len),len + cmpb,<> %r0,len,.Ldo4 + nop +.Ldo0: + shrpw a2, a3, %sar, t0 +1: stw,ma t0, 4(dstspc,dst) + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done) + +.Lcda_rdfault: +.Lcda_finish: + /* calculate new src, dst and len and jump to byte-copy loop */ + sub dst,save_dst,t0 + add save_src,t0,src + b .Lbyte_loop + sub save_len,t0,len + +.Lcase3: +1: ldw,ma 4(srcspc,src), a0 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) +1: ldw,ma 4(srcspc,src), a1 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + b .Ldo2 + ldo 1(len),len +.Lcase2: +1: ldw,ma 4(srcspc,src), a1 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) +1: ldw,ma 4(srcspc,src), a2 + ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault) + b .Ldo1 + ldo 2(len),len + + + /* fault exception fixup handlers: */ +#ifdef CONFIG_64BIT +.Lcopy16_fault: +10: b .Lcopy_done + std,ma t1,8(dstspc,dst) + ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done) +#endif + +.Lcopy8_fault: +10: b .Lcopy_done + stw,ma t1,4(dstspc,dst) + ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done) + + .exit +ENDPROC_CFI(pa_memcpy) + .procend + .end diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index f82ff10ed974..b3d47ec1d80a 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -2,7 +2,7 @@ * Optimized memory copy routines. * * Copyright (C) 2004 Randolph Chung - * Copyright (C) 2013 Helge Deller + * Copyright (C) 2013-2017 Helge Deller * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,474 +21,21 @@ * Portions derived from the GNU C Library * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc. * - * Several strategies are tried to try to get the best performance for various - * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using - * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using - * general registers. Unaligned copies are handled either by aligning the - * destination and then using shift-and-write method, or in a few cases by - * falling back to a byte-at-a-time copy. - * - * I chose to implement this in C because it is easier to maintain and debug, - * and in my experiments it appears that the C code generated by gcc (3.3/3.4 - * at the time of writing) is fairly optimal. Unfortunately some of the - * semantics of the copy routine (exception handling) is difficult to express - * in C, so we have to play some tricks to get it to work. - * - * All the loads and stores are done via explicit asm() code in order to use - * the right space registers. - * - * Testing with various alignments and buffer sizes shows that this code is - * often >10x faster than a simple byte-at-a-time copy, even for strangely - * aligned operands. It is interesting to note that the glibc version - * of memcpy (written in C) is actually quite fast already. This routine is - * able to beat it by 30-40% for aligned copies because of the loop unrolling, - * but in some cases the glibc version is still slightly faster. This lends - * more credibility that gcc can generate very good code as long as we are - * careful. - * - * TODO: - * - cache prefetching needs more experimentation to get optimal settings - * - try not to use the post-increment address modifiers; they create additional - * interlocks - * - replace byte-copy loops with stybs sequences */ -#ifdef __KERNEL__ #include #include #include -#define s_space "%%sr1" -#define d_space "%%sr2" -#else -#include "memcpy.h" -#define s_space "%%sr0" -#define d_space "%%sr0" -#define pa_memcpy new2_copy -#endif DECLARE_PER_CPU(struct exception_data, exception_data); -#define preserve_branch(label) do { \ - volatile int dummy = 0; \ - /* The following branch is never taken, it's just here to */ \ - /* prevent gcc from optimizing away our exception code. */ \ - if (unlikely(dummy != dummy)) \ - goto label; \ -} while (0) - #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3)) #define get_kernel_space() (0) -#define MERGE(w0, sh_1, w1, sh_2) ({ \ - unsigned int _r; \ - asm volatile ( \ - "mtsar %3\n" \ - "shrpw %1, %2, %%sar, %0\n" \ - : "=r"(_r) \ - : "r"(w0), "r"(w1), "r"(sh_2) \ - ); \ - _r; \ -}) -#define THRESHOLD 16 - -#ifdef DEBUG_MEMCPY -#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0) -#else -#define DPRINTF(fmt, args...) -#endif - -#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : _tt(_t), "+r"(_a) \ - : \ - : "r8") - -#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : "+r"(_a) \ - : _tt(_t) \ - : "r8") - -#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e) -#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e) -#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e) -#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e) -#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e) -#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e) - -#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : _tt(_t) \ - : "r"(_a) \ - : "r8") - -#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : \ - : _tt(_t), "r"(_a) \ - : "r8") - -#define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e) -#define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e) - -#ifdef CONFIG_PREFETCH -static inline void prefetch_src(const void *addr) -{ - __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr)); -} - -static inline void prefetch_dst(const void *addr) -{ - __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr)); -} -#else -#define prefetch_src(addr) do { } while(0) -#define prefetch_dst(addr) do { } while(0) -#endif - -#define PA_MEMCPY_OK 0 -#define PA_MEMCPY_LOAD_ERROR 1 -#define PA_MEMCPY_STORE_ERROR 2 - -/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words - * per loop. This code is derived from glibc. - */ -static noinline unsigned long copy_dstaligned(unsigned long dst, - unsigned long src, unsigned long len) -{ - /* gcc complains that a2 and a3 may be uninitialized, but actually - * they cannot be. Initialize a2/a3 to shut gcc up. - */ - register unsigned int a0, a1, a2 = 0, a3 = 0; - int sh_1, sh_2; - - /* prefetch_src((const void *)src); */ - - /* Calculate how to shift a word read at the memory operation - aligned srcp to make it aligned for copy. */ - sh_1 = 8 * (src % sizeof(unsigned int)); - sh_2 = 8 * sizeof(unsigned int) - sh_1; - - /* Make src aligned by rounding it down. */ - src &= -sizeof(unsigned int); - - switch (len % 4) - { - case 2: - /* a1 = ((unsigned int *) src)[0]; - a2 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a1, cda_ldw_exc); - ldw(s_space, 4, src, a2, cda_ldw_exc); - src -= 1 * sizeof(unsigned int); - dst -= 3 * sizeof(unsigned int); - len += 2; - goto do1; - case 3: - /* a0 = ((unsigned int *) src)[0]; - a1 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a0, cda_ldw_exc); - ldw(s_space, 4, src, a1, cda_ldw_exc); - src -= 0 * sizeof(unsigned int); - dst -= 2 * sizeof(unsigned int); - len += 1; - goto do2; - case 0: - if (len == 0) - return PA_MEMCPY_OK; - /* a3 = ((unsigned int *) src)[0]; - a0 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a3, cda_ldw_exc); - ldw(s_space, 4, src, a0, cda_ldw_exc); - src -=-1 * sizeof(unsigned int); - dst -= 1 * sizeof(unsigned int); - len += 0; - goto do3; - case 1: - /* a2 = ((unsigned int *) src)[0]; - a3 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a2, cda_ldw_exc); - ldw(s_space, 4, src, a3, cda_ldw_exc); - src -=-2 * sizeof(unsigned int); - dst -= 0 * sizeof(unsigned int); - len -= 1; - if (len == 0) - goto do0; - goto do4; /* No-op. */ - } - - do - { - /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */ -do4: - /* a0 = ((unsigned int *) src)[0]; */ - ldw(s_space, 0, src, a0, cda_ldw_exc); - /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ - stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); -do3: - /* a1 = ((unsigned int *) src)[1]; */ - ldw(s_space, 4, src, a1, cda_ldw_exc); - /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */ - stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc); -do2: - /* a2 = ((unsigned int *) src)[2]; */ - ldw(s_space, 8, src, a2, cda_ldw_exc); - /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */ - stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc); -do1: - /* a3 = ((unsigned int *) src)[3]; */ - ldw(s_space, 12, src, a3, cda_ldw_exc); - /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */ - stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc); - - src += 4 * sizeof(unsigned int); - dst += 4 * sizeof(unsigned int); - len -= 4; - } - while (len != 0); - -do0: - /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ - stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); - - preserve_branch(handle_load_error); - preserve_branch(handle_store_error); - - return PA_MEMCPY_OK; - -handle_load_error: - __asm__ __volatile__ ("cda_ldw_exc:\n"); - return PA_MEMCPY_LOAD_ERROR; - -handle_store_error: - __asm__ __volatile__ ("cda_stw_exc:\n"); - return PA_MEMCPY_STORE_ERROR; -} - - -/* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR. - * In case of an access fault the faulty address can be read from the per_cpu - * exception data struct. */ -static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp, - unsigned long len) -{ - register unsigned long src, dst, t1, t2, t3; - register unsigned char *pcs, *pcd; - register unsigned int *pws, *pwd; - register double *pds, *pdd; - unsigned long ret; - - src = (unsigned long)srcp; - dst = (unsigned long)dstp; - pcs = (unsigned char *)srcp; - pcd = (unsigned char *)dstp; - - /* prefetch_src((const void *)srcp); */ - - if (len < THRESHOLD) - goto byte_copy; - - /* Check alignment */ - t1 = (src ^ dst); - if (unlikely(t1 & (sizeof(double)-1))) - goto unaligned_copy; - - /* src and dst have same alignment. */ - - /* Copy bytes till we are double-aligned. */ - t2 = src & (sizeof(double) - 1); - if (unlikely(t2 != 0)) { - t2 = sizeof(double) - t2; - while (t2 && len) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - len--; - stbma(d_space, t3, pcd, pmc_store_exc); - t2--; - } - } - - pds = (double *)pcs; - pdd = (double *)pcd; - -#if 0 - /* Copy 8 doubles at a time */ - while (len >= 8*sizeof(double)) { - register double r1, r2, r3, r4, r5, r6, r7, r8; - /* prefetch_src((char *)pds + L1_CACHE_BYTES); */ - flddma(s_space, pds, r1, pmc_load_exc); - flddma(s_space, pds, r2, pmc_load_exc); - flddma(s_space, pds, r3, pmc_load_exc); - flddma(s_space, pds, r4, pmc_load_exc); - fstdma(d_space, r1, pdd, pmc_store_exc); - fstdma(d_space, r2, pdd, pmc_store_exc); - fstdma(d_space, r3, pdd, pmc_store_exc); - fstdma(d_space, r4, pdd, pmc_store_exc); - -#if 0 - if (L1_CACHE_BYTES <= 32) - prefetch_src((char *)pds + L1_CACHE_BYTES); -#endif - flddma(s_space, pds, r5, pmc_load_exc); - flddma(s_space, pds, r6, pmc_load_exc); - flddma(s_space, pds, r7, pmc_load_exc); - flddma(s_space, pds, r8, pmc_load_exc); - fstdma(d_space, r5, pdd, pmc_store_exc); - fstdma(d_space, r6, pdd, pmc_store_exc); - fstdma(d_space, r7, pdd, pmc_store_exc); - fstdma(d_space, r8, pdd, pmc_store_exc); - len -= 8*sizeof(double); - } -#endif - - pws = (unsigned int *)pds; - pwd = (unsigned int *)pdd; - -word_copy: - while (len >= 8*sizeof(unsigned int)) { - register unsigned int r1,r2,r3,r4,r5,r6,r7,r8; - /* prefetch_src((char *)pws + L1_CACHE_BYTES); */ - ldwma(s_space, pws, r1, pmc_load_exc); - ldwma(s_space, pws, r2, pmc_load_exc); - ldwma(s_space, pws, r3, pmc_load_exc); - ldwma(s_space, pws, r4, pmc_load_exc); - stwma(d_space, r1, pwd, pmc_store_exc); - stwma(d_space, r2, pwd, pmc_store_exc); - stwma(d_space, r3, pwd, pmc_store_exc); - stwma(d_space, r4, pwd, pmc_store_exc); - - ldwma(s_space, pws, r5, pmc_load_exc); - ldwma(s_space, pws, r6, pmc_load_exc); - ldwma(s_space, pws, r7, pmc_load_exc); - ldwma(s_space, pws, r8, pmc_load_exc); - stwma(d_space, r5, pwd, pmc_store_exc); - stwma(d_space, r6, pwd, pmc_store_exc); - stwma(d_space, r7, pwd, pmc_store_exc); - stwma(d_space, r8, pwd, pmc_store_exc); - len -= 8*sizeof(unsigned int); - } - - while (len >= 4*sizeof(unsigned int)) { - register unsigned int r1,r2,r3,r4; - ldwma(s_space, pws, r1, pmc_load_exc); - ldwma(s_space, pws, r2, pmc_load_exc); - ldwma(s_space, pws, r3, pmc_load_exc); - ldwma(s_space, pws, r4, pmc_load_exc); - stwma(d_space, r1, pwd, pmc_store_exc); - stwma(d_space, r2, pwd, pmc_store_exc); - stwma(d_space, r3, pwd, pmc_store_exc); - stwma(d_space, r4, pwd, pmc_store_exc); - len -= 4*sizeof(unsigned int); - } - - pcs = (unsigned char *)pws; - pcd = (unsigned char *)pwd; - -byte_copy: - while (len) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - stbma(d_space, t3, pcd, pmc_store_exc); - len--; - } - - return PA_MEMCPY_OK; - -unaligned_copy: - /* possibly we are aligned on a word, but not on a double... */ - if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) { - t2 = src & (sizeof(unsigned int) - 1); - - if (unlikely(t2 != 0)) { - t2 = sizeof(unsigned int) - t2; - while (t2) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - stbma(d_space, t3, pcd, pmc_store_exc); - len--; - t2--; - } - } - - pws = (unsigned int *)pcs; - pwd = (unsigned int *)pcd; - goto word_copy; - } - - /* Align the destination. */ - if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) { - t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1)); - while (t2) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - stbma(d_space, t3, pcd, pmc_store_exc); - len--; - t2--; - } - dst = (unsigned long)pcd; - src = (unsigned long)pcs; - } - - ret = copy_dstaligned(dst, src, len / sizeof(unsigned int)); - if (ret) - return ret; - - pcs += (len & -sizeof(unsigned int)); - pcd += (len & -sizeof(unsigned int)); - len %= sizeof(unsigned int); - - preserve_branch(handle_load_error); - preserve_branch(handle_store_error); - - goto byte_copy; - -handle_load_error: - __asm__ __volatile__ ("pmc_load_exc:\n"); - return PA_MEMCPY_LOAD_ERROR; - -handle_store_error: - __asm__ __volatile__ ("pmc_store_exc:\n"); - return PA_MEMCPY_STORE_ERROR; -} - - /* Returns 0 for success, otherwise, returns number of bytes not transferred. */ -static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) -{ - unsigned long ret, fault_addr, reference; - struct exception_data *d; - - ret = pa_memcpy_internal(dstp, srcp, len); - if (likely(ret == PA_MEMCPY_OK)) - return 0; - - /* if a load or store fault occured we can get the faulty addr */ - d = this_cpu_ptr(&exception_data); - fault_addr = d->fault_addr; - - /* error in load or store? */ - if (ret == PA_MEMCPY_LOAD_ERROR) - reference = (unsigned long) srcp; - else - reference = (unsigned long) dstp; +extern unsigned long pa_memcpy(void *dst, const void *src, + unsigned long len); - DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n", - ret, len, fault_addr, reference); - - if (fault_addr >= reference) - return len - (fault_addr - reference); - else - return len; -} - -#ifdef __KERNEL__ unsigned long __copy_to_user(void __user *dst, const void *src, unsigned long len) { @@ -537,5 +84,3 @@ long probe_kernel_read(void *dst, const void *src, size_t size) return __probe_kernel_read(dst, src, size); } - -#endif diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 1a0b4f63f0e9..040c48fc5391 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -149,6 +149,23 @@ int fixup_exception(struct pt_regs *regs) d->fault_space = regs->isr; d->fault_addr = regs->ior; + /* + * Fix up get_user() and put_user(). + * ASM_EXCEPTIONTABLE_ENTRY_EFAULT() sets the least-significant + * bit in the relative address of the fixup routine to indicate + * that %r8 should be loaded with -EFAULT to report a userspace + * access error. + */ + if (fix->fixup & 1) { + regs->gr[8] = -EFAULT; + + /* zero target register for get_user() */ + if (parisc_acctyp(0, regs->iir) == VM_READ) { + int treg = regs->iir & 0x1f; + regs->gr[treg] = 0; + } + } + regs->iaoq[0] = (unsigned long)&fix->fixup + fix->fixup; regs->iaoq[0] &= ~3; /* diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 779782f58324..9a53a06e5a3e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -290,7 +290,7 @@ EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled) _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 887e57182716..aed206475aa7 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -48,7 +48,7 @@ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; #if defined(CONFIG_X86_ESPFIX64) static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; #elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_START; +static const unsigned long vaddr_end = EFI_VA_END; #else static const unsigned long vaddr_end = __START_KERNEL_map; #endif @@ -105,7 +105,7 @@ void __init kernel_randomize_memory(void) */ BUILD_BUG_ON(vaddr_start >= vaddr_end); BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && - vaddr_end >= EFI_VA_START); + vaddr_end >= EFI_VA_END); BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f8960fca0827..9f21b0c5945d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -713,10 +713,9 @@ static void __init xen_reserve_xen_mfnlist(void) size = PFN_PHYS(xen_start_info->nr_p2m_frames); } - if (!xen_is_e820_reserved(start, size)) { - memblock_reserve(start, size); + memblock_reserve(start, size); + if (!xen_is_e820_reserved(start, size)) return; - } #ifdef CONFIG_X86_32 /* @@ -727,6 +726,7 @@ static void __init xen_reserve_xen_mfnlist(void) BUG(); #else xen_relocate_p2m(); + memblock_free(start, size); #endif } diff --git a/block/bio.c b/block/bio.c index db85c5753a76..655c9016052a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -372,10 +372,14 @@ static void punt_bios_to_rescuer(struct bio_set *bs) bio_list_init(&punt); bio_list_init(&nopunt); - while ((bio = bio_list_pop(current->bio_list))) + while ((bio = bio_list_pop(¤t->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[0] = nopunt; - *current->bio_list = nopunt; + bio_list_init(&nopunt); + while ((bio = bio_list_pop(¤t->bio_list[1]))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); @@ -462,7 +466,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * we retry with the original gfp_flags. */ - if (current->bio_list && !bio_list_empty(current->bio_list)) + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1]))) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); diff --git a/block/blk-core.c b/block/blk-core.c index 14d7c0740dc0..d1f2801ce836 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1994,7 +1994,14 @@ generic_make_request_checks(struct bio *bio) */ blk_qc_t generic_make_request(struct bio *bio) { - struct bio_list bio_list_on_stack; + /* + * bio_list_on_stack[0] contains bios submitted by the current + * make_request_fn. + * bio_list_on_stack[1] contains bios that were submitted before + * the current make_request_fn, but that haven't been processed + * yet. + */ + struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) @@ -2011,7 +2018,7 @@ blk_qc_t generic_make_request(struct bio *bio) * should be added at the tail */ if (current->bio_list) { - bio_list_add(current->bio_list, bio); + bio_list_add(¤t->bio_list[0], bio); goto out; } @@ -2030,23 +2037,39 @@ blk_qc_t generic_make_request(struct bio *bio) * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); - bio_list_init(&bio_list_on_stack); - current->bio_list = &bio_list_on_stack; + bio_list_init(&bio_list_on_stack[0]); + current->bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (likely(blk_queue_enter(q, false) == 0)) { + struct bio_list lower, same; + + /* Create a fresh bio_list for all subordinate requests */ + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); ret = q->make_request_fn(q, bio); blk_queue_exit(q); - bio = bio_list_pop(current->bio_list); + /* sort new bios into those for a lower level + * and those for the same level + */ + bio_list_init(&lower); + bio_list_init(&same); + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) + if (q == bdev_get_queue(bio->bi_bdev)) + bio_list_add(&same, bio); + else + bio_list_add(&lower, bio); + /* now assemble so we handle the lowest level first */ + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { - struct bio *bio_next = bio_list_pop(current->bio_list); - bio_io_error(bio); - bio = bio_next; } + bio = bio_list_pop(&bio_list_on_stack[0]); } while (bio); current->bio_list = NULL; /* deactivate */ diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 9ed087853dee..4c5678cfa9c4 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -2,7 +2,6 @@ # Makefile for the Linux ACPI interpreter # -ccflags-y := -Os ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT # diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index b4c1a6a51da4..03250e1f1103 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -25,9 +25,11 @@ ACPI_MODULE_NAME("platform"); static const struct acpi_device_id forbidden_id_list[] = { - {"PNP0000", 0}, /* PIC */ - {"PNP0100", 0}, /* Timer */ - {"PNP0200", 0}, /* AT DMA Controller */ + {"PNP0000", 0}, /* PIC */ + {"PNP0100", 0}, /* Timer */ + {"PNP0200", 0}, /* AT DMA Controller */ + {"ACPI0009", 0}, /* IOxAPIC */ + {"ACPI000A", 0}, /* IOAPIC */ {"", 0}, }; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index b1254f885fed..b87d27859141 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1299,6 +1299,8 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, goto out_pm_put; } + mutex_lock(&gpu->lock); + fence = etnaviv_gpu_fence_alloc(gpu); if (!fence) { event_free(gpu, event); @@ -1306,8 +1308,6 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, goto out_pm_put; } - mutex_lock(&gpu->lock); - gpu->event[event].fence = fence; submit->fence = fence->seqno; gpu->active_fence = submit->fence; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 3de5e6e21662..4ce04e06d9ac 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -213,8 +213,8 @@ static void radeon_evict_flags(struct ttm_buffer_object *bo, rbo->placement.num_busy_placement = 0; for (i = 0; i < rbo->placement.num_placement; i++) { if (rbo->placements[i].flags & TTM_PL_FLAG_VRAM) { - if (rbo->placements[0].fpfn < fpfn) - rbo->placements[0].fpfn = fpfn; + if (rbo->placements[i].fpfn < fpfn) + rbo->placements[i].fpfn = fpfn; } else { rbo->placement.busy_placement = &rbo->placements[i]; diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index 7aadce1f7e7a..c7e6c9839c9a 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -842,6 +842,17 @@ static void vc4_crtc_destroy_state(struct drm_crtc *crtc, drm_atomic_helper_crtc_destroy_state(crtc, state); } +static void +vc4_crtc_reset(struct drm_crtc *crtc) +{ + if (crtc->state) + __drm_atomic_helper_crtc_destroy_state(crtc->state); + + crtc->state = kzalloc(sizeof(struct vc4_crtc_state), GFP_KERNEL); + if (crtc->state) + crtc->state->crtc = crtc; +} + static const struct drm_crtc_funcs vc4_crtc_funcs = { .set_config = drm_atomic_helper_set_config, .destroy = vc4_crtc_destroy, @@ -849,7 +860,7 @@ static const struct drm_crtc_funcs vc4_crtc_funcs = { .set_property = NULL, .cursor_set = NULL, /* handled by drm_mode_cursor_universal */ .cursor_move = NULL, /* handled by drm_mode_cursor_universal */ - .reset = drm_atomic_helper_crtc_reset, + .reset = vc4_crtc_reset, .atomic_duplicate_state = vc4_crtc_duplicate_state, .atomic_destroy_state = vc4_crtc_destroy_state, .gamma_set = vc4_crtc_gamma_set, diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 5e7a5648e708..0c535d0f3b95 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2017,6 +2017,14 @@ static int wacom_parse_and_register(struct wacom *wacom, bool wireless) wacom_update_name(wacom, wireless ? " (WL)" : ""); + /* pen only Bamboo neither support touch nor pad */ + if ((features->type == BAMBOO_PEN) && + ((features->device_type & WACOM_DEVICETYPE_TOUCH) || + (features->device_type & WACOM_DEVICETYPE_PAD))) { + error = -ENODEV; + goto fail; + } + error = wacom_add_shared_data(hdev); if (error) goto fail; @@ -2064,14 +2072,6 @@ static int wacom_parse_and_register(struct wacom *wacom, bool wireless) goto fail_quirks; } - /* pen only Bamboo neither support touch nor pad */ - if ((features->type == BAMBOO_PEN) && - ((features->device_type & WACOM_DEVICETYPE_TOUCH) || - (features->device_type & WACOM_DEVICETYPE_PAD))) { - error = -ENODEV; - goto fail_quirks; - } - if (features->device_type & WACOM_DEVICETYPE_WL_MONITOR) error = hid_hw_open(hdev); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 628ba001bb3c..e66f4040d84b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -986,26 +986,29 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) struct dm_offload *o = container_of(cb, struct dm_offload, cb); struct bio_list list; struct bio *bio; + int i; INIT_LIST_HEAD(&o->cb.list); if (unlikely(!current->bio_list)) return; - list = *current->bio_list; - bio_list_init(current->bio_list); - - while ((bio = bio_list_pop(&list))) { - struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set) { - bio_list_add(current->bio_list, bio); - continue; + for (i = 0; i < 2; i++) { + list = current->bio_list[i]; + bio_list_init(¤t->bio_list[i]); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(¤t->bio_list[i], bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); } - - spin_lock(&bs->rescue_lock); - bio_list_add(&bs->rescue_list, bio); - queue_work(bs->rescue_workqueue, &bs->rescue_work); - spin_unlock(&bs->rescue_lock); } } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 55b5e0e77b17..4c4aab02e311 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -941,7 +941,8 @@ static void wait_barrier(struct r10conf *conf) !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && - !bio_list_empty(current->bio_list)), + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1]))), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 387ae1cbf698..a8b430ff117b 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -29,6 +29,8 @@ #include "sdhci-pltfm.h" +#define SDMMC_MC1R 0x204 +#define SDMMC_MC1R_DDR BIT(3) #define SDMMC_CACR 0x230 #define SDMMC_CACR_CAPWREN BIT(0) #define SDMMC_CACR_KEY (0x46 << 8) @@ -103,11 +105,18 @@ static void sdhci_at91_set_power(struct sdhci_host *host, unsigned char mode, sdhci_set_power_noreg(host, mode, vdd); } +void sdhci_at91_set_uhs_signaling(struct sdhci_host *host, unsigned int timing) +{ + if (timing == MMC_TIMING_MMC_DDR52) + sdhci_writeb(host, SDMMC_MC1R_DDR, SDMMC_MC1R); + sdhci_set_uhs_signaling(host, timing); +} + static const struct sdhci_ops sdhci_at91_sama5d2_ops = { .set_clock = sdhci_at91_set_clock, .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, - .set_uhs_signaling = sdhci_set_uhs_signaling, + .set_uhs_signaling = sdhci_at91_set_uhs_signaling, .set_power = sdhci_at91_set_power, }; diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index a983ba0349fb..7d275e72903a 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1823,6 +1823,9 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) struct sdhci_host *host = mmc_priv(mmc); unsigned long flags; + if (enable) + pm_runtime_get_noresume(host->mmc->parent); + spin_lock_irqsave(&host->lock, flags); if (enable) host->flags |= SDHCI_SDIO_IRQ_ENABLED; @@ -1831,6 +1834,9 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) sdhci_enable_sdio_irq_nolock(host, enable); spin_unlock_irqrestore(&host->lock, flags); + + if (!enable) + pm_runtime_put_noidle(host->mmc->parent); } static int sdhci_start_signal_voltage_switch(struct mmc_host *mmc, diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da10b484bd25..bde769b11e3b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2057,9 +2057,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * Revalidating a dead namespace sets capacity to 0. This will * end buffered writers dirtying pages that can't be synced. */ - if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - revalidate_disk(ns->disk); - + if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + continue; + revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); blk_mq_abort_requeue_list(ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5e52034ab010..8a9c186898c7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1983,8 +1983,10 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - if (!pci_device_is_present(pdev)) + if (!pci_device_is_present(pdev)) { nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + nvme_dev_disable(dev, false); + } flush_work(&dev->reset_work); nvme_uninit_ctrl(&dev->ctrl); diff --git a/drivers/pci/host/pcie-iproc-bcma.c b/drivers/pci/host/pcie-iproc-bcma.c index 8ce089043a27..46ca8ed031fe 100644 --- a/drivers/pci/host/pcie-iproc-bcma.c +++ b/drivers/pci/host/pcie-iproc-bcma.c @@ -44,8 +44,7 @@ static int iproc_pcie_bcma_probe(struct bcma_device *bdev) { struct device *dev = &bdev->dev; struct iproc_pcie *pcie; - LIST_HEAD(res); - struct resource res_mem; + LIST_HEAD(resources); int ret; pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); @@ -62,22 +61,23 @@ static int iproc_pcie_bcma_probe(struct bcma_device *bdev) pcie->base_addr = bdev->addr; - res_mem.start = bdev->addr_s[0]; - res_mem.end = bdev->addr_s[0] + SZ_128M - 1; - res_mem.name = "PCIe MEM space"; - res_mem.flags = IORESOURCE_MEM; - pci_add_resource(&res, &res_mem); + pcie->mem.start = bdev->addr_s[0]; + pcie->mem.end = bdev->addr_s[0] + SZ_128M - 1; + pcie->mem.name = "PCIe MEM space"; + pcie->mem.flags = IORESOURCE_MEM; + pci_add_resource(&resources, &pcie->mem); pcie->map_irq = iproc_pcie_bcma_map_irq; - ret = iproc_pcie_setup(pcie, &res); - if (ret) + ret = iproc_pcie_setup(pcie, &resources); + if (ret) { dev_err(dev, "PCIe controller setup failed\n"); - - pci_free_resource_list(&res); + pci_free_resource_list(&resources); + return ret; + } bcma_set_drvdata(bdev, pcie); - return ret; + return 0; } static void iproc_pcie_bcma_remove(struct bcma_device *bdev) diff --git a/drivers/pci/host/pcie-iproc-platform.c b/drivers/pci/host/pcie-iproc-platform.c index a3de087976b3..7dcaddcd2f16 100644 --- a/drivers/pci/host/pcie-iproc-platform.c +++ b/drivers/pci/host/pcie-iproc-platform.c @@ -46,7 +46,7 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev) struct device_node *np = dev->of_node; struct resource reg; resource_size_t iobase = 0; - LIST_HEAD(res); + LIST_HEAD(resources); int ret; of_id = of_match_device(iproc_pcie_of_match_table, dev); @@ -108,23 +108,24 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev) pcie->phy = NULL; } - ret = of_pci_get_host_bridge_resources(np, 0, 0xff, &res, &iobase); + ret = of_pci_get_host_bridge_resources(np, 0, 0xff, &resources, + &iobase); if (ret) { - dev_err(dev, - "unable to get PCI host bridge resources\n"); + dev_err(dev, "unable to get PCI host bridge resources\n"); return ret; } pcie->map_irq = of_irq_parse_and_map_pci; - ret = iproc_pcie_setup(pcie, &res); - if (ret) + ret = iproc_pcie_setup(pcie, &resources); + if (ret) { dev_err(dev, "PCIe controller setup failed\n"); - - pci_free_resource_list(&res); + pci_free_resource_list(&resources); + return ret; + } platform_set_drvdata(pdev, pcie); - return ret; + return 0; } static int iproc_pcie_pltfm_remove(struct platform_device *pdev) diff --git a/drivers/pci/host/pcie-iproc.h b/drivers/pci/host/pcie-iproc.h index e84d93c53c7b..fa4226742bcd 100644 --- a/drivers/pci/host/pcie-iproc.h +++ b/drivers/pci/host/pcie-iproc.h @@ -68,6 +68,7 @@ struct iproc_pcie { #ifdef CONFIG_ARM struct pci_sys_data sysdata; #endif + struct resource mem; struct pci_bus *root_bus; struct phy *phy; int (*map_irq)(const struct pci_dev *, u8, u8); diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 7bb20684e9fa..d3145799b92f 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -113,7 +113,7 @@ struct alua_queue_data { #define ALUA_POLICY_SWITCH_ALL 1 static void alua_rtpg_work(struct work_struct *work); -static void alua_rtpg_queue(struct alua_port_group *pg, +static bool alua_rtpg_queue(struct alua_port_group *pg, struct scsi_device *sdev, struct alua_queue_data *qdata, bool force); static void alua_check(struct scsi_device *sdev, bool force); @@ -862,7 +862,13 @@ static void alua_rtpg_work(struct work_struct *work) kref_put(&pg->kref, release_port_group); } -static void alua_rtpg_queue(struct alua_port_group *pg, +/** + * alua_rtpg_queue() - cause RTPG to be submitted asynchronously + * + * Returns true if and only if alua_rtpg_work() will be called asynchronously. + * That function is responsible for calling @qdata->fn(). + */ +static bool alua_rtpg_queue(struct alua_port_group *pg, struct scsi_device *sdev, struct alua_queue_data *qdata, bool force) { @@ -870,8 +876,8 @@ static void alua_rtpg_queue(struct alua_port_group *pg, unsigned long flags; struct workqueue_struct *alua_wq = kaluad_wq; - if (!pg) - return; + if (!pg || scsi_device_get(sdev)) + return false; spin_lock_irqsave(&pg->lock, flags); if (qdata) { @@ -884,14 +890,12 @@ static void alua_rtpg_queue(struct alua_port_group *pg, pg->flags |= ALUA_PG_RUN_RTPG; kref_get(&pg->kref); pg->rtpg_sdev = sdev; - scsi_device_get(sdev); start_queue = 1; } else if (!(pg->flags & ALUA_PG_RUN_RTPG) && force) { pg->flags |= ALUA_PG_RUN_RTPG; /* Do not queue if the worker is already running */ if (!(pg->flags & ALUA_PG_RUNNING)) { kref_get(&pg->kref); - sdev = NULL; start_queue = 1; } } @@ -900,13 +904,17 @@ static void alua_rtpg_queue(struct alua_port_group *pg, alua_wq = kaluad_sync_wq; spin_unlock_irqrestore(&pg->lock, flags); - if (start_queue && - !queue_delayed_work(alua_wq, &pg->rtpg_work, - msecs_to_jiffies(ALUA_RTPG_DELAY_MSECS))) { - if (sdev) - scsi_device_put(sdev); - kref_put(&pg->kref, release_port_group); + if (start_queue) { + if (queue_delayed_work(alua_wq, &pg->rtpg_work, + msecs_to_jiffies(ALUA_RTPG_DELAY_MSECS))) + sdev = NULL; + else + kref_put(&pg->kref, release_port_group); } + if (sdev) + scsi_device_put(sdev); + + return true; } /* @@ -1007,11 +1015,13 @@ static int alua_activate(struct scsi_device *sdev, mutex_unlock(&h->init_mutex); goto out; } - fn = NULL; rcu_read_unlock(); mutex_unlock(&h->init_mutex); - alua_rtpg_queue(pg, sdev, qdata, true); + if (alua_rtpg_queue(pg, sdev, qdata, true)) + fn = NULL; + else + err = SCSI_DH_DEV_OFFLINED; kref_put(&pg->kref, release_port_group); out: if (fn) diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 763f012fdeca..87f5e694dbed 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -221,7 +221,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) task->num_scatter = qc->n_elem; } else { for_each_sg(qc->sg, sg, qc->n_elem, si) - xfer += sg->length; + xfer += sg_dma_len(sg); task->total_xfer_len = xfer; task->num_scatter = si; diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index fe7469c901f7..ad33238cef17 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -2153,8 +2153,6 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) "Timer for the VP[%d] has stopped\n", vha->vp_idx); } - BUG_ON(atomic_read(&vha->vref_count)); - qla2x00_free_fcports(vha); mutex_lock(&ha->vport_lock); diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 73b12e41d992..8e63a7b90277 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -3742,6 +3742,7 @@ typedef struct scsi_qla_host { struct qla8044_reset_template reset_tmplt; struct qla_tgt_counters tgt_counters; uint16_t bbcr; + wait_queue_head_t vref_waitq; } scsi_qla_host_t; struct qla27xx_image_status { @@ -3780,6 +3781,7 @@ struct qla_tgt_vp_map { mb(); \ if (__vha->flags.delete_progress) { \ atomic_dec(&__vha->vref_count); \ + wake_up(&__vha->vref_waitq); \ __bail = 1; \ } else { \ __bail = 0; \ @@ -3788,6 +3790,7 @@ struct qla_tgt_vp_map { #define QLA_VHA_MARK_NOT_BUSY(__vha) do { \ atomic_dec(&__vha->vref_count); \ + wake_up(&__vha->vref_waitq); \ } while (0) /* diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 5b09296b46a3..8f12f6baa6b8 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -4356,6 +4356,7 @@ qla2x00_update_fcports(scsi_qla_host_t *base_vha) } } atomic_dec(&vha->vref_count); + wake_up(&vha->vref_waitq); } spin_unlock_irqrestore(&ha->vport_slock, flags); } diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index cf7ba52bae66..3dfb54abc874 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -74,13 +74,14 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) * ensures no active vp_list traversal while the vport is removed * from the queue) */ - spin_lock_irqsave(&ha->vport_slock, flags); - while (atomic_read(&vha->vref_count)) { - spin_unlock_irqrestore(&ha->vport_slock, flags); - - msleep(500); + wait_event_timeout(vha->vref_waitq, atomic_read(&vha->vref_count), + 10*HZ); - spin_lock_irqsave(&ha->vport_slock, flags); + spin_lock_irqsave(&ha->vport_slock, flags); + if (atomic_read(&vha->vref_count)) { + ql_dbg(ql_dbg_vport, vha, 0xfffa, + "vha->vref_count=%u timeout\n", vha->vref_count.counter); + vha->vref_count = (atomic_t)ATOMIC_INIT(0); } list_del(&vha->list); qlt_update_vp_map(vha, RESET_VP_IDX); @@ -269,6 +270,7 @@ qla2x00_alert_all_vps(struct rsp_que *rsp, uint16_t *mb) spin_lock_irqsave(&ha->vport_slock, flags); atomic_dec(&vha->vref_count); + wake_up(&vha->vref_waitq); } i++; } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index bea819e5336d..4f361d8d84be 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4045,6 +4045,7 @@ struct scsi_qla_host *qla2x00_create_host(struct scsi_host_template *sht, spin_lock_init(&vha->work_lock); spin_lock_init(&vha->cmd_list_lock); + init_waitqueue_head(&vha->vref_waitq); sprintf(vha->host_str, "%s_%ld", QLA2XXX_DRIVER_NAME, vha->host_no); ql_dbg(ql_dbg_init, vha, 0x0041, diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 121de0aaa6ad..f753df25ba34 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -998,6 +998,8 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) result = get_user(val, ip); if (result) return result; + if (val > SG_MAX_CDB_SIZE) + return -ENOMEM; sfp->next_cmd_len = (val > 0) ? val : 0; return 0; case SG_GET_VERSION_NUM: diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index fabbe76203bb..4d079cdaa7a3 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -1938,6 +1938,11 @@ static void atmel_flush_buffer(struct uart_port *port) atmel_uart_writel(port, ATMEL_PDC_TCR, 0); atmel_port->pdc_tx.ofs = 0; } + /* + * in uart_flush_buffer(), the xmit circular buffer has just + * been cleared, so we have to reset tx_len accordingly. + */ + atmel_port->tx_len = 0; } /* @@ -2471,6 +2476,9 @@ static void atmel_console_write(struct console *co, const char *s, u_int count) pdc_tx = atmel_uart_readl(port, ATMEL_PDC_PTSR) & ATMEL_PDC_TXTEN; atmel_uart_writel(port, ATMEL_PDC_PTCR, ATMEL_PDC_TXTDIS); + /* Make sure that tx path is actually able to send characters */ + atmel_uart_writel(port, ATMEL_US_CR, ATMEL_US_TXEN); + uart_console_write(port, s, count, atmel_console_putchar); /* diff --git a/drivers/tty/serial/mxs-auart.c b/drivers/tty/serial/mxs-auart.c index 770454e0dfa3..07390f8c3681 100644 --- a/drivers/tty/serial/mxs-auart.c +++ b/drivers/tty/serial/mxs-auart.c @@ -1085,7 +1085,7 @@ static void mxs_auart_settermios(struct uart_port *u, AUART_LINECTRL_BAUD_DIV_MAX); baud_max = u->uartclk * 32 / AUART_LINECTRL_BAUD_DIV_MIN; baud = uart_get_baud_rate(u, termios, old, baud_min, baud_max); - div = u->uartclk * 32 / baud; + div = DIV_ROUND_CLOSEST(u->uartclk * 32, baud); } ctrl |= AUART_LINECTRL_BAUD_DIVFRAC(div & 0x3F); diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 479e223f9cff..f029aad67183 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -520,8 +520,10 @@ static int rh_call_control (struct usb_hcd *hcd, struct urb *urb) */ tbuf_size = max_t(u16, sizeof(struct usb_hub_descriptor), wLength); tbuf = kzalloc(tbuf_size, GFP_KERNEL); - if (!tbuf) - return -ENOMEM; + if (!tbuf) { + status = -ENOMEM; + goto err_alloc; + } bufp = tbuf; @@ -734,6 +736,7 @@ static int rh_call_control (struct usb_hcd *hcd, struct urb *urb) } kfree(tbuf); + err_alloc: /* any errors get returned through the urb completion */ spin_lock_irq(&hcd_root_hub_lock); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1536aeb0abab..4e894d301c88 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2532,17 +2532,14 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) } nfs4_stateid_copy(&stateid, &delegation->stateid); - if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) || + !test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags)) { rcu_read_unlock(); nfs_finish_clear_delegation_stateid(state, &stateid); return; } - if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { - rcu_read_unlock(); - return; - } - cred = get_rpccred(delegation->cred); rcu_read_unlock(); status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 010aff5c5a79..536009e50387 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -790,6 +790,7 @@ nfserrno (int errno) { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, }; int i; diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index d346d42c54d1..33db69be4832 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -39,6 +39,7 @@ #include "xfs_rmap_btree.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ialloc_btree.h" /* * Per-AG Block Reservations @@ -200,22 +201,30 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; + xfs_extlen_t reserved; - resv = xfs_perag_resv(pag, type); if (used > ask) ask = used; - resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = ask - used; - mp->m_ag_max_usable -= ask; + reserved = ask - used; - trace_xfs_ag_resv_init(pag, type, ask); - - error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); - if (error) + error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); + xfs_warn(mp, +"Per-AG reservation for AG %u failed. Filesystem may run out of space.", + pag->pag_agno); + return error; + } - return error; + mp->m_ag_max_usable -= ask; + + resv = xfs_perag_resv(pag, type); + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = reserved; + + trace_xfs_ag_resv_init(pag, type, ask); + return 0; } /* Create a per-AG block reservation. */ @@ -223,6 +232,8 @@ int xfs_ag_resv_init( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; int error = 0; @@ -231,23 +242,45 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(pag->pag_mount, - pag->pag_agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, - ask, used); + error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) { + /* + * Because we didn't have per-AG reservations when the + * finobt feature was added we might not be able to + * reserve all needed blocks. Warn and fall back to the + * old and potentially buggy code in that case, but + * ensure we do have the reservation for the refcountbt. + */ + ask = used = 0; + + mp->m_inotbt_nores = true; + + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, + &used); + if (error) + goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } } /* Create the AGFL metadata reservation */ if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, - &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; @@ -256,9 +289,16 @@ xfs_ag_resv_init( goto out; } +#ifdef DEBUG + /* need to read in the AGF for the ASSERT below to work */ + error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); + if (error) + return error; + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= pag->pagf_freeblks + pag->pagf_flcount); +#endif out: return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f52fd63fce19..5a508b011e27 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -769,8 +769,8 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: args.fsbno = *firstblock; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -796,17 +796,19 @@ xfs_bmap_extents_to_btree( if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - dfops->dop_low = true; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return -ENOSPC; + } /* * Allocation can't fail, the space was reserved. */ - ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (dfops->dop_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); *firstblock = cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; @@ -1278,7 +1280,6 @@ xfs_bmap_read_extents( /* REFERENCED */ xfs_extnum_t room; /* number of entries there's room for */ - bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : @@ -1291,9 +1292,7 @@ xfs_bmap_read_extents( ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. @@ -1955,6 +1954,7 @@ xfs_bmap_add_extent_delay_real( */ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_state(ep, new->br_state); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); (*nextents)++; @@ -2293,6 +2293,7 @@ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ @@ -2312,12 +2313,14 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - struct xfs_mount *mp = tp->t_mountp; + struct xfs_mount *mp = ip->i_mount; *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(*idx >= 0); ASSERT(*idx <= xfs_iext_count(ifp)); @@ -2376,7 +2379,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < xfs_iext_count(&ip->i_df) - 1) { + if (*idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2416,7 +2419,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents -= 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2459,7 +2463,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2494,7 +2499,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_set_state(ep, newext); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2606,7 +2612,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2684,7 +2691,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2732,7 +2740,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 2, &r[0], state); - ip->i_d.di_nextents += 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2786,17 +2795,17 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); + error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, - 0, &tmp_logflags, XFS_DATA_FORK); + 0, &tmp_logflags, whichfork); *logflagsp |= tmp_logflags; if (error) goto done; @@ -2808,7 +2817,7 @@ xfs_bmap_add_extent_unwritten_real( *curp = cur; } - xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); done: *logflagsp |= rval; return error; @@ -2900,7 +2909,8 @@ xfs_bmap_add_extent_hole_delay( oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2921,7 +2931,8 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2937,7 +2948,8 @@ xfs_bmap_add_extent_hole_delay( temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); @@ -3913,17 +3925,13 @@ xfs_bmap_btalloc( * the first block that was allocated. */ ASSERT(*ap->firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *ap->firstblock) == - XFS_FSB_TO_AGNO(mp, args.fsbno) || - (ap->dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *ap->firstblock) < - XFS_FSB_TO_AGNO(mp, args.fsbno))); + XFS_FSB_TO_AGNO(mp, *ap->firstblock) <= + XFS_FSB_TO_AGNO(mp, args.fsbno)); ap->blkno = args.fsbno; if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; - ASSERT(nullfb || fb_agno == args.agno || - (ap->dfops->dop_low && fb_agno < args.agno)); + ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; if (!(ap->flags & XFS_BMAPI_COWFORK)) ap->ip->i_d.di_nblocks += args.len; @@ -4249,6 +4257,19 @@ xfs_bmapi_read( return 0; } +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, @@ -4335,13 +4356,8 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); - /* - * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay - * might have merged it into one of the neighbouring ones. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Tag the inode if blocks were preallocated. Note that COW fork @@ -4353,10 +4369,6 @@ xfs_bmapi_reserve_delalloc( if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) xfs_inode_set_cowblocks_tag(ip); - ASSERT(got->br_startoff <= aoff); - ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); - ASSERT(isnullstartblock(got->br_startblock)); - ASSERT(got->br_state == XFS_EXT_NORM); return 0; out_unreserve_blocks: @@ -4461,10 +4473,16 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_NORM; /* - * A wasdelay extent has been initialized, so shouldn't be flagged - * as unwritten. + * In the data fork, a wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. + * + * For the cow fork, however, we convert delalloc reservations + * (extents allocated for speculative preallocation) to + * allocated unwritten extents, and only convert the unwritten + * extents to real extents when we're about to write the data. */ - if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && + (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4515,8 +4533,6 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; - ASSERT(whichfork != XFS_COW_FORK); - /* * Modify (by adding) the state flag, if writing. */ @@ -4541,8 +4557,8 @@ xfs_bmapi_convert_unwritten( return error; } - error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->dfops, + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, + &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion @@ -4551,8 +4567,12 @@ xfs_bmapi_convert_unwritten( * in the transaction for the sake of fsync(), even if nothing has * changed, because fsync() will not force the log for this transaction * unless it sees the inode pinned. + * + * Note: If we're only converting cow fork extents, there aren't + * any on-disk updates to make, so we don't need to log anything. */ - bma->logflags |= tmp_logflags | XFS_ILOG_CORE; + if (whichfork != XFS_COW_FORK) + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -4626,15 +4646,15 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & XFS_BMAPI_IGSTATE)); - ASSERT(tp != NULL); + ASSERT(tp != NULL || + (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == + (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4840,13 +4860,9 @@ xfs_bmapi_write( if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, *firstblock) <= XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock) || - (dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock))); + bma.cur->bc_private.b.firstblock)); *firstblock = bma.cur->bc_private.b.firstblock; } xfs_btree_del_cursor(bma.cur, @@ -4881,34 +4897,59 @@ xfs_bmap_split_indlen( xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ xfs_filblks_t stolen = 0; + xfs_filblks_t resfactor; /* * Steal as many blocks as we can to try and satisfy the worst case * indlen for both new extents. */ - while (nres > ores && avail) { - nres--; - avail--; - stolen++; - } + if (ores < nres && avail) + stolen = XFS_FILBLKS_MIN(nres - ores, avail); + ores += stolen; + + /* nothing else to do if we've satisfied the new reservation */ + if (ores >= nres) + return stolen; + + /* + * We can't meet the total required reservation for the two extents. + * Calculate the percent of the overall shortage between both extents + * and apply this percentage to each of the requested indlen values. + * This distributes the shortage fairly and reduces the chances that one + * of the two extents is left with nothing when extents are repeatedly + * split. + */ + resfactor = (ores * 100); + do_div(resfactor, nres); + len1 *= resfactor; + do_div(len1, 100); + len2 *= resfactor; + do_div(len2, 100); + ASSERT(len1 + len2 <= ores); + ASSERT(len1 < *indlen1 && len2 < *indlen2); /* - * The only blocks available are those reserved for the original - * extent and what we can steal from the extent being removed. - * If this still isn't enough to satisfy the combined - * requirements for the two new extents, skim blocks off of each - * of the new reservations until they match what is available. + * Hand out the remainder to each extent. If one of the two reservations + * is zero, we want to make sure that one gets a block first. The loop + * below starts with len1, so hand len2 a block right off the bat if it + * is zero. */ - while (nres > ores) { - if (len1) { - len1--; - nres--; + ores -= (len1 + len2); + ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); + if (ores && !len2 && *indlen2) { + len2++; + ores--; + } + while (ores) { + if (len1 < *indlen1) { + len1++; + ores--; } - if (nres == ores) + if (!ores) break; - if (len2) { - len2--; - nres--; + if (len2 < *indlen2) { + len2++; + ores--; } } @@ -5656,8 +5697,8 @@ __xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - &lastx, &cur, &del, firstblock, dfops, - &logflags); + whichfork, &lastx, &cur, &del, + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5714,8 +5755,9 @@ __xfs_bunmapi( prev.br_state = XFS_EXT_UNWRITTEN; lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &prev, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &prev, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; @@ -5723,8 +5765,9 @@ __xfs_bunmapi( ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &del, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &del, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index f76c1693ff01..5c3918678bb6 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -453,8 +453,8 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: /* * Make sure there is sufficient room left in the AG to * complete a full tree split for an extent insert. If @@ -494,8 +494,8 @@ xfs_bmbt_alloc_block( if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - cur->bc_private.b.dfops->dop_low = true; args.fsbno = cur->bc_private.b.firstblock; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } @@ -512,7 +512,7 @@ xfs_bmbt_alloc_block( goto error0; cur->bc_private.b.dfops->dop_low = true; } - if (args.fsbno == NULLFSBLOCK) { + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 21e6a6ab6b9a..2849d3fa3d0b 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -810,7 +810,8 @@ xfs_btree_read_bufl( xfs_daddr_t d; /* real disk block address */ int error; - ASSERT(fsbno != NULLFSBLOCK); + if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) + return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, mp->m_bsize, lock, &bp, ops); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..3b0fc1afada5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -491,7 +491,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ - (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index f2dc1a950c85..1bdf2888295b 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2633,7 +2633,7 @@ xfs_da_read_buf( /* * Readahead the dir/attr block. */ -xfs_daddr_t +int xfs_da_reada_buf( struct xfs_inode *dp, xfs_dablk_t bno, @@ -2664,7 +2664,5 @@ xfs_da_reada_buf( if (mapp != &map) kmem_free(mapp); - if (error) - return -1; - return mappedbno; + return error; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 98c75cbe6ac2..4e29cb6a3627 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops); -xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, +int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 75a557432d0f..bbd1238852b3 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .verify_write = xfs_dir3_free_write_verify, }; +/* Everything ok in the free block header? */ +static bool +xfs_dir3_free_header_check( + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + unsigned int firstdb; + int maxbests; + + maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); + firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - + xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * + maxbests; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + if (be32_to_cpu(hdr3->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr3->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) + return false; + } else { + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + + if (be32_to_cpu(hdr->firstdb) != firstdb) + return false; + if (be32_to_cpu(hdr->nvalid) > maxbests) + return false; + if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) + return false; + } + return true; +} static int __xfs_dir3_free_read( @@ -168,11 +204,22 @@ __xfs_dir3_free_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { + xfs_buf_ioerror(*bpp, -EFSCORRUPTED); + xfs_verifier_error(*bpp); + xfs_trans_brelse(tp, *bpp); + return -EFSCORRUPTED; + } /* try read returns without an error or *bpp if it lands in a hole */ - if (!err && tp && *bpp) + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); - return err; + + return 0; } int diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d45c03779dae..a2818f6e8598 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment( struct xfs_mount *mp) { if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) return mp->m_sb.sb_inoalignmt; return 1; } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 6c6b95947e71..b9c351ff0422 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -82,11 +82,12 @@ xfs_finobt_set_root( } STATIC int -xfs_inobt_alloc_block( +__xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int *stat) + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( args.maxlen = 1; args.prod = 1; args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.resv = resv; error = xfs_alloc_vextent(&args); if (error) { @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( } STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, + XFS_AG_RESV_METADATA); +} + +STATIC int xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, - .alloc_block = xfs_inobt_alloc_block, + .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_inobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, @@ -478,3 +501,64 @@ xfs_inobt_rec_check_count( return 0; } #endif /* DEBUG */ + +static xfs_extlen_t +xfs_inobt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_inobt_mxr[0] == 0) + return 0; + + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); +} + +static int +xfs_inobt_count_blocks( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_btnum_t btnum, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); + error = xfs_btree_count_blocks(cur, tree_blocks); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + return error; +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_finobt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + xfs_extlen_t tree_len = 0; + int error; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); + if (error) + return error; + + *ask += xfs_inobt_max_size(mp); + *used += tree_len; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index bd88453217ce..aa81e2e63f3f 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 222e103356c6..25c1e078aef6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,6 +26,7 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_btree.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -429,11 +430,13 @@ xfs_iformat_btree( /* REFERENCED */ int nrecs; int size; + int level; ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); /* * blow out if -- fork has less extents than can fit in @@ -446,7 +449,8 @@ xfs_iformat_btree( XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || + level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -497,15 +501,14 @@ xfs_iread_extents( * We know that the size is valid (it's checked in iformat_btree) */ ifp->if_bytes = ifp->if_real_bytes = 0; - ifp->if_flags |= XFS_IFEXTENTS; xfs_iext_add(ifp, 0, nextents); error = xfs_bmap_read_extents(tp, ip, whichfork); if (error) { xfs_iext_destroy(ifp); - ifp->if_flags &= ~XFS_IFEXTENTS; return error; } xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + ifp->if_flags |= XFS_IFEXTENTS; return 0; } /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 06763f5cc701..0457abe4118a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -279,54 +279,49 @@ xfs_end_io( struct xfs_ioend *ioend = container_of(work, struct xfs_ioend, io_work); struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_off_t offset = ioend->io_offset; + size_t size = ioend->io_size; int error = ioend->io_bio->bi_error; /* - * Set an error if the mount has shut down and proceed with end I/O - * processing so it can perform whatever cleanups are necessary. + * Just clean up the in-memory strutures if the fs has been shut down. */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { error = -EIO; + goto done; + } /* - * For a CoW extent, we need to move the mapping from the CoW fork - * to the data fork. If instead an error happened, just dump the - * new blocks. + * Clean up any COW blocks on an I/O error. */ - if (ioend->io_type == XFS_IO_COW) { - if (error) - goto done; - if (ioend->io_bio->bi_error) { - error = xfs_reflink_cancel_cow_range(ip, - ioend->io_offset, ioend->io_size); - goto done; + if (unlikely(error)) { + switch (ioend->io_type) { + case XFS_IO_COW: + xfs_reflink_cancel_cow_range(ip, offset, size, true); + break; } - error = xfs_reflink_end_cow(ip, ioend->io_offset, - ioend->io_size); - if (error) - goto done; + + goto done; } /* - * For unwritten extents we need to issue transactions to convert a - * range to normal written extens after the data I/O has finished. - * Detecting and handling completion IO errors is done individually - * for each case as different cleanup operations need to be performed - * on error. + * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_type == XFS_IO_UNWRITTEN) { - if (error) - goto done; - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - } else if (ioend->io_append_trans) { - error = xfs_setfilesize_ioend(ioend, error); - } else { - ASSERT(!xfs_ioend_is_append(ioend) || - ioend->io_type == XFS_IO_COW); + switch (ioend->io_type) { + case XFS_IO_COW: + error = xfs_reflink_end_cow(ip, offset, size); + break; + case XFS_IO_UNWRITTEN: + error = xfs_iomap_write_unwritten(ip, offset, size); + break; + default: + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); + break; } done: + if (ioend->io_append_trans) + error = xfs_setfilesize_ioend(ioend, error); xfs_destroy_ioend(ioend, error); } @@ -486,6 +481,12 @@ xfs_submit_ioend( struct xfs_ioend *ioend, int status) { + /* Convert CoW extents to regular */ + if (!status && ioend->io_type == XFS_IO_COW) { + status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), + ioend->io_offset, ioend->io_size); + } + /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && ioend->io_type != XFS_IO_UNWRITTEN && @@ -1257,44 +1258,6 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ -static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - STATIC int __xfs_get_blocks( struct inode *inode, @@ -1432,13 +1395,6 @@ __xfs_get_blocks( if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - xfs_map_buffer(inode, bh_result, &imap, offset); if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index efb8ccd6bbf2..5c395e485170 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -917,17 +917,18 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) */ int xfs_free_eofblocks( - xfs_mount_t *mp, - xfs_inode_t *ip, - bool need_iolock) + struct xfs_inode *ip) { - xfs_trans_t *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - xfs_bmbt_irec_t imap; + struct xfs_trans *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + struct xfs_bmbt_irec imap; + struct xfs_mount *mp = ip->i_mount; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); /* * Figure out if there are any blocks beyond the end @@ -944,6 +945,10 @@ xfs_free_eofblocks( error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); + /* + * If there are blocks after the end of file, truncate the file to its + * current size to free them up. + */ if (!error && (nimaps != 0) && (imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks)) { @@ -954,22 +959,13 @@ xfs_free_eofblocks( if (error) return error; - /* - * There are blocks after the end of file. - * Free them up now by truncating the file to - * its current size. - */ - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) - return -EAGAIN; - } + /* wait on dio to ensure i_size has settled */ + inode_dio_wait(VFS_I(ip)); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -997,8 +993,6 @@ xfs_free_eofblocks( } xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); } return error; } @@ -1393,10 +1387,16 @@ xfs_shift_file_space( xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; + uint resblks; ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); if (direction == SHIFT_LEFT) { + /* + * Reserve blocks to cover potential extent merges after left + * shift operations. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); next_fsb = XFS_B_TO_FSB(mp, offset + len); stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); } else { @@ -1404,6 +1404,7 @@ xfs_shift_file_space( * If right shift, delegate the work of initialization of * next_fsb to xfs_bmap_shift_extent as it has ilock held. */ + resblks = 0; next_fsb = NULLFSBLOCK; stop_fsb = XFS_B_TO_FSB(mp, offset); } @@ -1415,7 +1416,7 @@ xfs_shift_file_space( * into the accessible region of the file. */ if (xfs_can_free_eofblocks(ip, true)) { - error = xfs_free_eofblocks(mp, ip, false); + error = xfs_free_eofblocks(ip); if (error) return error; } @@ -1445,21 +1446,14 @@ xfs_shift_file_space( } while (!error && !done) { - /* - * We would need to reserve permanent block for transaction. - * This will come into picture when after shifting extent into - * hole we found that adjacent extents can be merged which - * may lead to freeing of a block during record update. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, + &tp); if (error) break; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + ip->i_gdquot, ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 68a621a8e0c0..f1005393785c 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,8 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); -int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, - bool need_iolock); +int xfs_free_eofblocks(struct xfs_inode *ip); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, struct xfs_swapext *sx); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2975cb2319f4..0306168af332 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks( */ bp->b_last_error = 0; bp->b_retries = 0; + bp->b_first_retry_time = 0; xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 162dc186cf04..29c2f997aedf 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -45,18 +45,7 @@ xfs_extent_busy_insert( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. - */ - trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); new->agno = agno; new->bno = bno; new->length = len; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9a5d64b5f35a..1209ad29e902 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -554,6 +554,15 @@ xfs_file_dio_aio_write( if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { unaligned_io = 1; + + /* + * We can't properly handle unaligned direct I/O to reflink + * files yet, as we can't unshare a partial block. + */ + if (xfs_is_reflink_inode(ip)) { + trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); + return -EREMCHG; + } iolock = XFS_IOLOCK_EXCL; } else { iolock = XFS_IOLOCK_SHARED; @@ -675,8 +684,10 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; + int iolock; +write_retry: + iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); @@ -686,7 +697,6 @@ xfs_file_buffered_aio_write( /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); -write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) @@ -702,18 +712,21 @@ xfs_file_buffered_aio_write( * running at the same time. */ if (ret == -EDQUOT && !enospc) { + xfs_rw_iunlock(ip, iolock); enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; enospc = xfs_inode_free_quota_cowblocks(ip); if (enospc) goto write_retry; + iolock = 0; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; enospc = 1; xfs_flush_inodes(ip->i_mount); - eofb.eof_scan_owner = ip->i_ino; /* for locking */ + + xfs_rw_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); goto write_retry; @@ -721,7 +734,8 @@ xfs_file_buffered_aio_write( current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + if (iolock) + xfs_rw_iunlock(ip, iolock); return ret; } @@ -987,9 +1001,9 @@ xfs_dir_open( */ mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir3_data_readahead(ip, 0, -1); + error = xfs_dir3_data_readahead(ip, 0, -1); xfs_iunlock(ip, mode); - return 0; + return error; } STATIC int diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 29cc9886a3cb..3fb1f3fb8efe 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1324,13 +1324,10 @@ xfs_inode_free_eofblocks( int flags, void *args) { - int ret; + int ret = 0; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1358,21 +1355,19 @@ xfs_inode_free_eofblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } - ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); - - /* don't revisit the inode if we're not waiting */ - if (ret == -EAGAIN && !(flags & SYNC_WAIT)) - ret = 0; + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (flags & SYNC_WAIT) + ret = -EAGAIN; + return ret; + } + ret = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -1419,15 +1414,10 @@ __xfs_inode_free_quota_eofblocks( struct xfs_eofblocks eofb = {0}; struct xfs_dquot *dq; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* - * Set the scan owner to avoid a potential livelock. Otherwise, the scan - * can repeatedly trylock on the inode we're currently processing. We - * run a sync scan to increase effectiveness and use the union filter to + * Run a sync scan to increase effectiveness and use the union filter to * cover all applicable quotas in a single scan. */ - eofb.eof_scan_owner = ip->i_ino; eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { @@ -1579,12 +1569,9 @@ xfs_inode_free_cowblocks( { int ret; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1617,28 +1604,16 @@ xfs_inode_free_cowblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } /* Free the CoW blocks */ - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - } + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); - if (need_iolock) { - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index a1e02f4708ab..8a7c849b4dea 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -27,7 +27,6 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; - xfs_ino_t eof_scan_owner; }; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ @@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user( dst->eof_flags = src->eof_flags; dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; - dst->eof_scan_owner = NULLFSINO; dst->eof_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 512ff13ed66a..e50636c9a89c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1624,7 +1624,7 @@ xfs_itruncate_extents( /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block); + last_block, true); if (error) goto out; @@ -1701,32 +1701,34 @@ xfs_release( if (xfs_can_free_eofblocks(ip, false)) { /* + * Check if the inode is being opened, written and closed + * frequently and we have delayed allocation blocks outstanding + * (e.g. streaming writes from the NFS server), truncating the + * blocks past EOF will cause fragmentation to occur. + * + * In this case don't do the truncation, but we have to be + * careful how we detect this case. Blocks beyond EOF show up as + * i_delayed_blks even when the inode is clean, so we need to + * truncate them away first before checking for a dirty release. + * Hence on the first dirty close we will still remove the + * speculative allocation, but after that we will leave it in + * place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + /* * If we can't get the iolock just skip truncating the blocks * past EOF because we could deadlock with the mmap_sem - * otherwise. We'll get another chance to drop them once the + * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. - * - * Further, check if the inode is being opened, written and - * closed frequently and we have delayed allocation blocks - * outstanding (e.g. streaming writes from the NFS server), - * truncating the blocks past EOF will cause fragmentation to - * occur. - * - * In this case don't do the truncation, either, but we have to - * be careful how we detect this case. Blocks beyond EOF show - * up as i_delayed_blks even when the inode is clean, so we - * need to truncate them away first before checking for a dirty - * release. Hence on the first dirty close we will still remove - * the speculative allocation, but after that we will leave it - * in place. */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - - error = xfs_free_eofblocks(mp, ip, true); - if (error && error != -EAGAIN) - return error; + if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + error = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (error) + return error; + } /* delalloc blocks after truncation means it really is dirty */ if (ip->i_delayed_blks) @@ -1801,22 +1803,23 @@ xfs_inactive_ifree( int error; /* - * The ifree transaction might need to allocate blocks for record - * insertion to the finobt. We don't want to fail here at ENOSPC, so - * allow ifree to dip into the reserved block pool if necessary. - * - * Freeing large sets of inodes generally means freeing inode chunks, - * directory and file data blocks, so this should be relatively safe. - * Only under severe circumstances should it be possible to free enough - * inodes to exhaust the reserve block pool via finobt expansion while - * at the same time not creating free space in the filesystem. + * We try to use a per-AG reservation for any block needed by the finobt + * tree, but as the finobt feature predates the per-AG reservation + * support a degraded file system might not have enough space for the + * reservation at mount time. In that case try to dip into the reserved + * pool and pray. * * Send a warning if the reservation does happen to fail, as the inode * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (unlikely(mp->m_inotbt_nores)) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); + } else { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); + } if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -1912,8 +1915,11 @@ xfs_inactive( * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. */ - if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(mp, ip, false); + if (xfs_can_free_eofblocks(ip, true)) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } return; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e8889614cec3..360562484e7b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -637,6 +637,11 @@ xfs_file_iomap_begin_delay( goto out_unlock; } + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -685,7 +690,7 @@ xfs_iomap_write_allocate( int nres; if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK; + flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; /* * Make sure that the dquots are there. @@ -1061,7 +1066,8 @@ xfs_file_iomap_end_delalloc( struct xfs_inode *ip, loff_t offset, loff_t length, - ssize_t written) + ssize_t written, + struct iomap *iomap) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_fsb; @@ -1080,14 +1086,14 @@ xfs_file_iomap_end_delalloc( end_fsb = XFS_B_TO_FSB(mp, offset + length); /* - * Trim back delalloc blocks if we didn't manage to write the whole - * range reserved. + * Trim delalloc blocks if they were allocated by this write and we + * didn't manage to write the whole range. * * We don't need to care about racing delalloc as we hold i_mutex * across the reserve/allocate/unreserve calls. If there are delalloc * blocks in the range, they are ours. */ - if (start_fsb < end_fsb) { + if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), XFS_FSB_TO_B(mp, end_fsb) - 1); @@ -1117,7 +1123,7 @@ xfs_file_iomap_end( { if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, - length, written); + length, written, iomap); return 0; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf481..13796f212f98 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -502,8 +502,7 @@ STATIC void xfs_set_inoalignment(xfs_mount_t *mp) { if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; else mp->m_inoalign_mask = 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb..1bf878b0492c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -140,6 +140,7 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ + bool m_inotbt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4d3f74e3c5e1..2252f163c38f 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -82,11 +82,22 @@ * mappings are a reservation against the free space in the filesystem; * adjacent mappings can also be combined into fewer larger mappings. * + * As an optimization, the CoW extent size hint (cowextsz) creates + * outsized aligned delalloc reservations in the hope of landing out of + * order nearby CoW writes in a single extent on disk, thereby reducing + * fragmentation and improving future performance. + * + * D: --RRRRRRSSSRRRRRRRR--- (data fork) + * C: ------DDDDDDD--------- (CoW fork) + * * When dirty pages are being written out (typically in writepage), the - * delalloc reservations are converted into real mappings by allocating - * blocks and replacing the delalloc mapping with real ones. A delalloc - * mapping can be replaced by several real ones if the free space is - * fragmented. + * delalloc reservations are converted into unwritten mappings by + * allocating blocks and replacing the delalloc mapping with real ones. + * A delalloc mapping can be replaced by several unwritten ones if the + * free space is fragmented. + * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUUUUUU--------- * * We want to adapt the delalloc mechanism for copy-on-write, since the * write paths are similar. The first two steps (creating the reservation @@ -101,13 +112,29 @@ * Block-aligned directio writes will use the same mechanism as buffered * writes. * + * Just prior to submitting the actual disk write requests, we convert + * the extents representing the range of the file actually being written + * (as opposed to extra pieces created for the cowextsize hint) to real + * extents. This will become important in the next step: + * + * D: --RRRRRRSSSRRRRRRRR--- + * C: ------UUrrUUU--------- + * * CoW remapping must be done after the data block write completes, * because we don't want to destroy the old data fork map until we're sure * the new block has been written. Since the new mappings are kept in a * separate fork, we can simply iterate these mappings to find the ones * that cover the file blocks that we just CoW'd. For each extent, simply * unmap the corresponding range in the data fork, map the new range into - * the data fork, and remove the extent from the CoW fork. + * the data fork, and remove the extent from the CoW fork. Because of + * the presence of the cowextsize hint, however, we must be careful + * only to remap the blocks that we've actually written out -- we must + * never remap delalloc reservations nor CoW staging blocks that have + * yet to be written. This corresponds exactly to the real extents in + * the CoW fork: + * + * D: --RRRRRRrrSRRRRRRRR--- + * C: ------UU--UUU--------- * * Since the remapping operation can be applied to an arbitrary file * range, we record the need for the remap step as a flag in the ioend @@ -296,6 +323,65 @@ xfs_reflink_reserve_cow( return 0; } +/* Convert part of an unwritten CoW extent to a real one. */ +STATIC int +xfs_reflink_convert_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + struct xfs_defer_ops *dfops) +{ + struct xfs_bmbt_irec irec = *imap; + xfs_fsblock_t first_block; + int nimaps = 1; + + if (imap->br_state == XFS_EXT_NORM) + return 0; + + xfs_trim_extent(&irec, offset_fsb, count_fsb); + trace_xfs_reflink_convert_cow(ip, &irec); + if (irec.br_blockcount == 0) + return 0; + return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block, + 0, &irec, &nimaps, dfops); +} + +/* Convert all of the unwritten CoW extents in a file's range to real ones. */ +int +xfs_reflink_convert_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_bmbt_irec got; + struct xfs_defer_ops dfops; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + xfs_extnum_t idx; + bool found; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* Convert all the extents to real from unwritten. */ + for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); + found && got.br_startoff < end_fsb; + found = xfs_iext_get_extent(ifp, ++idx, &got)) { + error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, + end_fsb - offset_fsb, &dfops); + if (error) + break; + } + + /* Finish up. */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ static int __xfs_reflink_allocate_cow( @@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow( goto out_unlock; ASSERT(nimaps == 1); + /* Make sure there's a CoW reservation for it. */ error = xfs_reflink_reserve_cow(ip, &imap, &shared); if (error) goto out_trans_cancel; @@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow( goto out_trans_cancel; } + /* Allocate the entire reservation as unwritten blocks. */ xfs_trans_ijoin(tp, ip, 0); error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, - XFS_BMAPI_COWFORK, &first_block, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), &imap, &nimaps, &dfops); if (error) goto out_trans_cancel; + /* Finish up. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_trans_cancel; @@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range( if (error) { trace_xfs_reflink_allocate_cow_range_error(ip, error, _RET_IP_); - break; + return error; } } - return error; + /* Convert the CoW extents to regular. */ + return xfs_reflink_convert_cow(ip, offset, count); } /* @@ -481,14 +571,18 @@ xfs_reflink_trim_irec_to_next_cow( } /* - * Cancel all pending CoW reservations for some block range of an inode. + * Cancel CoW reservations for some block range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb) + xfs_fileoff_t end_fsb, + bool cancel_real) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, prev, del; @@ -515,7 +609,7 @@ xfs_reflink_cancel_cow_blocks( &idx, &got, &del); if (error) break; - } else { + } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); @@ -558,13 +652,17 @@ xfs_reflink_cancel_cow_blocks( } /* - * Cancel all pending CoW reservations for some byte range of an inode. + * Cancel CoW reservations for some byte range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_range( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool cancel_real) { struct xfs_trans *tp; xfs_fileoff_t offset_fsb; @@ -590,7 +688,8 @@ xfs_reflink_cancel_cow_range( xfs_trans_ijoin(tp, ip, 0); /* Scrape out the old CoW reservations */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, + cancel_real); if (error) goto out_cancel; @@ -669,6 +768,16 @@ xfs_reflink_end_cow( ASSERT(!isnullstartblock(got.br_startblock)); + /* + * Don't remap unwritten extents; these are + * speculatively preallocated CoW extents that have been + * allocated but have not yet been involved in a write. + */ + if (got.br_state == XFS_EXT_UNWRITTEN) { + idx--; + goto next_extent; + } + /* Unmap the old blocks in the data fork. */ xfs_defer_init(&dfops, &firstfsb); rlen = del.br_blockcount; @@ -885,13 +994,14 @@ STATIC int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, - xfs_extlen_t cowextsize) + xfs_extlen_t cowextsize, + bool is_dedupe) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -912,6 +1022,10 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } + if (!is_dedupe) { + xfs_trans_ichgtime(tp, dest, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1428,7 +1542,8 @@ xfs_reflink_remap_range( !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) cowextsize = src->i_d.di_cowextsize; - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + is_dedupe); out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); @@ -1580,7 +1695,7 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 97ea9b487884..a57966fc7ddd 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, struct xfs_bmbt_irec *imap, bool *need_alloc); extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, @@ -37,9 +39,9 @@ extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb); + xfs_fileoff_t end_fsb, bool cancel_real); extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count); + xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ade4691e3f74..dbbd3f1fd2b7 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -948,7 +948,7 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_remove); if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) xfs_warn(ip->i_mount, "Error %d while evicting CoW blocks for inode %llu.", diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0907752be62d..828f383df121 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3183,6 +3183,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __field(xfs_fileoff_t, lblk) __field(xfs_extlen_t, len) __field(xfs_fsblock_t, pblk) + __field(int, state) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -3190,13 +3191,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __entry->lblk = irec->br_startoff; __entry->len = irec->br_blockcount; __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, __entry->len, - __entry->pblk) + __entry->pblk, + __entry->state) ); #define DEFINE_INODE_IREC_EVENT(name) \ DEFINE_EVENT(xfs_inode_irec_class, name, \ @@ -3345,11 +3348,12 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_RW_EVENT(xfs_reflink_reserve_cow); DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); -DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01c0b9cc3915..8c58db2c09c6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev); +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 254698856b8f..8b35bdbdc214 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -739,6 +739,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, + int nr) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { diff --git a/kernel/padata.c b/kernel/padata.c index 7848f0566403..b4a3c0ae649b 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -190,19 +190,20 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) reorder = &next_queue->reorder; + spin_lock(&reorder->lock); if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); - spin_unlock(&reorder->lock); pd->processed++; + spin_unlock(&reorder->lock); goto out; } + spin_unlock(&reorder->lock); if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); diff --git a/lib/syscall.c b/lib/syscall.c index 63239e097b13..a72cd0996230 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -11,6 +11,7 @@ static int collect_syscall(struct task_struct *target, long *callno, if (!try_get_task_stack(target)) { /* Task has no stack, so the task isn't in a syscall. */ + *sp = *pc = 0; *callno = -1; return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b6adedbafaf5..65c36acf8a6b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4471,6 +4471,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { struct page *page = NULL; spinlock_t *ptl; + pte_t pte; retry: ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); @@ -4480,12 +4481,13 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, */ if (!pmd_huge(*pmd)) goto out; - if (pmd_present(*pmd)) { + pte = huge_ptep_get((pte_t *)pmd); + if (pte_present(pte)) { page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { - if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); __migration_entry_wait(mm, (pte_t *)pmd, ptl); goto retry; diff --git a/mm/rmap.c b/mm/rmap.c index 1ef36404e7b2..cd37c1c7e21b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1295,7 +1295,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1335,7 +1335,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); diff --git a/mm/workingset.c b/mm/workingset.c index 33f6f4db32fd..4c4f05655e6e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -492,7 +492,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = __list_lru_init(&workingset_shadow_nodes, true, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2efb335deada..25a30be862e9 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -469,11 +470,16 @@ static int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; + unsigned int noio_flag; int ret; BUG_ON(con->sock); + + /* sock_create_kern() allocates with GFP_KERNEL */ + noio_flag = memalloc_noio_save(); ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); + memalloc_noio_restore(noio_flag); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index 3f4efcb85df5..3490d21ab9e7 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -265,6 +265,10 @@ int snd_seq_fifo_resize(struct snd_seq_fifo *f, int poolsize) /* NOTE: overflow flag is not cleared */ spin_unlock_irqrestore(&f->lock, flags); + /* close the old pool and wait until all users are gone */ + snd_seq_pool_mark_closing(oldpool); + snd_use_lock_sync(&f->use_lock); + /* release cells in old pool */ for (cell = oldhead; cell; cell = next) { next = cell->next; diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 112caa2d3c14..bb1aad39d987 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4846,6 +4846,7 @@ enum { ALC292_FIXUP_DISABLE_AAMIX, ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK, ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, + ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE, ALC275_FIXUP_DELL_XPS, ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE, ALC293_FIXUP_LENOVO_SPK_NOISE, @@ -5446,6 +5447,15 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE }, + [ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x18, 0x01a1913c }, /* use as headset mic, without its own jack detect */ + { } + }, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE + }, [ALC275_FIXUP_DELL_XPS] = { .type = HDA_FIXUP_VERBS, .v.verbs = (const struct hda_verb[]) { @@ -5518,7 +5528,7 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc298_fixup_speaker_volume, .chained = true, - .chain_id = ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, + .chain_id = ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE, }, [ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER] = { .type = HDA_FIXUP_PINS, diff --git a/sound/soc/atmel/atmel-classd.c b/sound/soc/atmel/atmel-classd.c index 89ac5f5a93eb..7ae46c2647d4 100644 --- a/sound/soc/atmel/atmel-classd.c +++ b/sound/soc/atmel/atmel-classd.c @@ -349,7 +349,7 @@ static int atmel_classd_codec_dai_digital_mute(struct snd_soc_dai *codec_dai, } #define CLASSD_ACLK_RATE_11M2896_MPY_8 (112896 * 100 * 8) -#define CLASSD_ACLK_RATE_12M288_MPY_8 (12228 * 1000 * 8) +#define CLASSD_ACLK_RATE_12M288_MPY_8 (12288 * 1000 * 8) static struct { int rate; diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index b5b1934d8550..bef8a4546c12 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -448,7 +448,7 @@ static int skl_tplg_set_module_init_data(struct snd_soc_dapm_widget *w) if (bc->set_params != SKL_PARAM_INIT) continue; - mconfig->formats_config.caps = (u32 *)&bc->params; + mconfig->formats_config.caps = (u32 *)bc->params; mconfig->formats_config.caps_size = bc->size; break; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a29786dd9522..4d28a9ddbee0 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); - kvm->buses[bus_idx]->ioeventfd_count--; + if (kvm->buses[bus_idx]) + kvm->buses[bus_idx]->ioeventfd_count--; ioeventfd_release(p); ret = 0; break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f9ee2929cfe..f4c6d4f6d2e8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -720,8 +720,11 @@ static void kvm_destroy_vm(struct kvm *kvm) list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); - for (i = 0; i < KVM_NR_BUSES; i++) - kvm_io_bus_destroy(kvm->buses[i]); + for (i = 0; i < KVM_NR_BUSES; i++) { + if (kvm->buses[i]) + kvm_io_bus_destroy(kvm->buses[i]); + kvm->buses[i] = NULL; + } kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); @@ -3463,6 +3466,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3480,6 +3485,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; /* First try the device referenced by cookie. */ if ((cookie >= 0) && (cookie < bus->dev_count) && @@ -3530,6 +3537,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3542,6 +3551,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; + if (!bus) + return -ENOMEM; + /* exclude ioeventfd which is limited by maximum fd */ if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; @@ -3561,37 +3573,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, } /* Caller must hold slots_lock. */ -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - int i, r; + int i; struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - r = -ENOENT; + if (!bus) + return; + for (i = 0; i < bus->dev_count; i++) if (bus->range[i].dev == dev) { - r = 0; break; } - if (r) - return r; + if (i == bus->dev_count) + return; new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); - if (!new_bus) - return -ENOMEM; + if (!new_bus) { + pr_err("kvm: failed to shrink bus, removing it completely\n"); + goto broken; + } memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); +broken: rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); - return r; + return; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, @@ -3604,6 +3620,8 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + if (!bus) + goto out_unlock; dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); if (dev_idx < 0)