diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 44c6e57303988..500d5d8937cbb 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -511,6 +511,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/tsx_async_abort /sys/devices/system/cpu/vulnerabilities/itlb_multihit /sys/devices/system/cpu/vulnerabilities/mmio_stale_data + /sys/devices/system/cpu/vulnerabilities/retbleed Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst index 9393c50b5afc9..c98fd11907cc8 100644 --- a/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst +++ b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst @@ -230,6 +230,20 @@ The possible values in this file are: * - 'Mitigation: Clear CPU buffers' - The processor is vulnerable and the CPU buffer clearing mitigation is enabled. + * - 'Unknown: No mitigations' + - The processor vulnerability status is unknown because it is + out of Servicing period. Mitigation is not attempted. + +Definitions: +------------ + +Servicing period: The process of providing functional and security updates to +Intel processors or platforms, utilizing the Intel Platform Update (IPU) +process or other similar mechanisms. + +End of Servicing Updates (ESU): ESU is the date at which Intel will no +longer provide Servicing, such as through IPU or other similar update +processes. ESU dates will typically be aligned to end of quarter. If the processor is vulnerable then the following information is appended to the above information: diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index f2ab8a5b6a4b8..7f553859dba82 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -271,7 +271,7 @@ poll cycle or the number of packets processed reaches netdev_budget. netdev_max_backlog ------------------ -Maximum number of packets, queued on the INPUT side, when the interface +Maximum number of packets, queued on the INPUT side, when the interface receives packets faster than kernel can process them. netdev_rss_key diff --git a/Makefile b/Makefile index 48140575f960b..a80179d2c0057 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 10 -SUBLEVEL = 139 +SUBLEVEL = 140 EXTRAVERSION = NAME = Dare mighty things diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index ca42d58e8c821..78263dadd00da 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -220,6 +220,8 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #ifdef CONFIG_ARM64_ERRATUM_1286807 { ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 0), + }, + { /* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */ ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe), }, diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 286cec4d86d7b..cc6ed74960501 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -107,7 +107,7 @@ #define R1(i) (((i)>>21)&0x1f) #define R2(i) (((i)>>16)&0x1f) #define R3(i) ((i)&0x1f) -#define FR3(i) ((((i)<<1)&0x1f)|(((i)>>6)&1)) +#define FR3(i) ((((i)&0x1f)<<1)|(((i)>>6)&1)) #define IM(i,n) (((i)>>1&((1<<(n-1))-1))|((i)&1?((0-1L)<<(n-1)):0)) #define IM5_2(i) IM((i)>>16,5) #define IM5_3(i) IM((i),5) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index ec801d3bbb37a..137a170f47d4f 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -77,6 +77,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) memcpy(dst, src, arch_task_struct_size); dst->thread.fpu.regs = dst->thread.fpu.fprs; + + /* + * Don't transfer over the runtime instrumentation or the guarded + * storage control block pointers. These fields are cleared here instead + * of in copy_thread() to avoid premature freeing of associated memory + * on fork() failure. Wait to clear the RI flag because ->stack still + * refers to the source thread. + */ + dst->thread.ri_cb = NULL; + dst->thread.gs_cb = NULL; + dst->thread.gs_bc_cb = NULL; + return 0; } @@ -134,13 +146,11 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, frame->childregs.flags = 0; if (new_stackp) frame->childregs.gprs[15] = new_stackp; - - /* Don't copy runtime instrumentation info */ - p->thread.ri_cb = NULL; + /* + * Clear the runtime instrumentation flag after the above childregs + * copy. The CB pointer was already cleared in arch_dup_task_struct(). + */ frame->childregs.psw.mask &= ~PSW_MASK_RI; - /* Don't copy guarded storage control block */ - p->thread.gs_cb = NULL; - p->thread.gs_bc_cb = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index bd8516e6c353c..42173a7be3bb4 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1114,6 +1114,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { reg->config = mask; + + /* + * The Arch LBR HW can retrieve the common branch types + * from the LBR_INFO. It doesn't require the high overhead + * SW disassemble. + * Enable the branch type by default for the Arch LBR. + */ + reg->reg |= X86_BR_TYPE_SAVE; return 0; } diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index bbd1120ae1610..fa9289718147a 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -657,6 +657,22 @@ int snb_pci2phy_map_init(int devid) return 0; } +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * SNB IMC counters are 32-bit and are laid out back to back + * in MMIO space. Therefore we must use a 32-bit accessor function + * using readq() from uncore_mmio_read_counter() causes problems + * because it is reading 64-bit at a time. This is okay for the + * uncore_perf_event_update() function because it drops the upper + * 32-bits but not okay for plain uncore_read_counter() as invoked + * in uncore_pmu_event_start(). + */ + return (u64)readl(box->io_addr + hwc->event_base); +} + static struct pmu snb_uncore_imc_pmu = { .task_ctx_nr = perf_invalid_context, .event_init = snb_uncore_imc_event_init, @@ -676,7 +692,7 @@ static struct intel_uncore_ops snb_uncore_imc_ops = { .disable_event = snb_uncore_imc_disable_event, .enable_event = snb_uncore_imc_enable_event, .hw_config = snb_uncore_imc_hw_config, - .read_counter = uncore_mmio_read_counter, + .read_counter = snb_uncore_imc_read_counter, }; static struct intel_uncore_type snb_uncore_imc = { diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 37ba0cdf99aa8..f507ad7c7fd7b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -429,7 +429,8 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index aa4ee46f00ce5..a300a19255b66 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -424,7 +424,8 @@ static void __init mmio_select_mitigation(void) u64 ia32_cap; if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - cpu_mitigations_off()) { + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } @@ -529,6 +530,8 @@ out: pr_info("TAA: %s\n", taa_strings[taa_mitigation]); if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); } static void __init md_clear_select_mitigation(void) @@ -2198,6 +2201,9 @@ static ssize_t tsx_async_abort_show_state(char *buf) static ssize_t mmio_stale_data_show_state(char *buf) { + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2344,6 +2350,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: @@ -2403,7 +2410,10 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9fc91482e85e3..56573241d0293 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1024,7 +1024,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) -#define NO_EIBRS_PBRSB BIT(9) +#define NO_MMIO BIT(9) +#define NO_EIBRS_PBRSB BIT(10) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1045,6 +1046,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1063,9 +1069,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1080,18 +1086,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), {} }; @@ -1245,10 +1251,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Affected CPU list is generally enough to enumerate the vulnerability, * but for virtualization case check for ARCH_CAP MSR bits also, VMM may * not want the guest to enumerate the bug. + * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ - if (cpu_matches(cpu_vuln_blacklist, MMIO) && - !arch_cap_mmio_immune(ia32_cap)) - setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index c451d5f6422f6..cc071c4c65240 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -93,22 +93,27 @@ static struct orc_entry *orc_find(unsigned long ip); static struct orc_entry *orc_ftrace_find(unsigned long ip) { struct ftrace_ops *ops; - unsigned long caller; + unsigned long tramp_addr, offset; ops = ftrace_ops_trampoline(ip); if (!ops) return NULL; + /* Set tramp_addr to the start of the code copied by the trampoline */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) - caller = (unsigned long)ftrace_regs_call; + tramp_addr = (unsigned long)ftrace_regs_caller; else - caller = (unsigned long)ftrace_call; + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; /* Prevent unlikely recursion */ - if (ip == caller) + if (ip == tramp_addr) return NULL; - return orc_find(caller); + return orc_find(tramp_addr); } #else static struct orc_entry *orc_ftrace_find(unsigned long ip) diff --git a/block/blk-mq.c b/block/blk-mq.c index 90f64bb42fbd1..cfc039fabf8ce 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1402,7 +1402,8 @@ out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ - if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued) + if ((!list_empty(list) || errors || needs_resource || + ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued) q->mq_ops->commit_rqs(hctx); /* * Any items that need requeuing? Stuff them into hctx->dispatch, @@ -2080,6 +2081,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); if (ret != BLK_STS_OK) { + errors++; if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_mq_request_bypass_insert(rq, false, @@ -2087,7 +2089,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, break; } blk_mq_end_request(rq, ret); - errors++; } else queued++; } diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index 6c7d05b37c986..7df0c6e3ba63c 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -148,7 +148,7 @@ void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy) unsigned int cpu; for_each_cpu(cpu, policy->related_cpus) { - struct acpi_processor *pr = per_cpu(processors, policy->cpu); + struct acpi_processor *pr = per_cpu(processors, cpu); if (pr) freq_qos_remove_request(&pr->thermal_req); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e4517d483bdc3..b10410585a746 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1031,6 +1031,11 @@ loop_set_status_from_info(struct loop_device *lo, lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; + + /* loff_t vars have been assigned __u64 */ + if (lo->lo_offset < 0 || lo->lo_sizelimit < 0) + return -EOVERFLOW; + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index 884317ee1759f..0043dec37a870 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6278,11 +6278,11 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; + md_bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ if (mddev->event_work.func) flush_workqueue(md_misc_wq); - md_bitmap_destroy(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); @@ -6299,6 +6299,7 @@ void md_stop(struct mddev *mddev) /* stop the array and free an attached data structures. * This is called from dm-raid */ + __md_stop_writes(mddev); __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 325b20729d8ba..b0f8d551b61db 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1988,30 +1988,24 @@ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) */ void bond_3ad_initialize(struct bonding *bond, u16 tick_resolution) { - /* check that the bond is not initialized yet */ - if (!MAC_ADDRESS_EQUAL(&(BOND_AD_INFO(bond).system.sys_mac_addr), - bond->dev->dev_addr)) { - - BOND_AD_INFO(bond).aggregator_identifier = 0; - - BOND_AD_INFO(bond).system.sys_priority = - bond->params.ad_actor_sys_prio; - if (is_zero_ether_addr(bond->params.ad_actor_system)) - BOND_AD_INFO(bond).system.sys_mac_addr = - *((struct mac_addr *)bond->dev->dev_addr); - else - BOND_AD_INFO(bond).system.sys_mac_addr = - *((struct mac_addr *)bond->params.ad_actor_system); + BOND_AD_INFO(bond).aggregator_identifier = 0; + BOND_AD_INFO(bond).system.sys_priority = + bond->params.ad_actor_sys_prio; + if (is_zero_ether_addr(bond->params.ad_actor_system)) + BOND_AD_INFO(bond).system.sys_mac_addr = + *((struct mac_addr *)bond->dev->dev_addr); + else + BOND_AD_INFO(bond).system.sys_mac_addr = + *((struct mac_addr *)bond->params.ad_actor_system); - /* initialize how many times this module is called in one - * second (should be about every 100ms) - */ - ad_ticks_per_sec = tick_resolution; + /* initialize how many times this module is called in one + * second (should be about every 100ms) + */ + ad_ticks_per_sec = tick_resolution; - bond_3ad_initiate_agg_selection(bond, - AD_AGGREGATOR_SELECTION_TIMER * - ad_ticks_per_sec); - } + bond_3ad_initiate_agg_selection(bond, + AD_AGGREGATOR_SELECTION_TIMER * + ad_ticks_per_sec); } /** diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 23b80aa171dd0..819f9df9425c6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -599,7 +599,7 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs, bool reset) hw_resc->max_stat_ctxs -= le16_to_cpu(req.min_stat_ctx) * n; hw_resc->max_vnics -= le16_to_cpu(req.min_vnics) * n; if (bp->flags & BNXT_FLAG_CHIP_P5) - hw_resc->max_irqs -= vf_msix * n; + hw_resc->max_nqs -= vf_msix; rc = pf->active_vfs; } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 5733526fa245c..59963b901be0f 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -371,6 +371,19 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) bool if_running, pool_present = !!pool; int ret = 0, pool_failure = 0; + if (qid >= vsi->num_rxq || qid >= vsi->num_txq) { + netdev_err(vsi->netdev, "Please use queue id in scope of combined queues count\n"); + pool_failure = -EINVAL; + goto failure; + } + + if (!is_power_of_2(vsi->rx_rings[qid]->count) || + !is_power_of_2(vsi->tx_rings[qid]->count)) { + netdev_err(vsi->netdev, "Please align ring sizes to power of 2\n"); + pool_failure = -EINVAL; + goto failure; + } + if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi); if (if_running) { @@ -393,6 +406,7 @@ xsk_pool_if_up: netdev_err(vsi->netdev, "ice_qp_ena error = %d\n", ret); } +failure: if (pool_failure) { netdev_err(vsi->netdev, "Could not %sable buffer pool, error = %d\n", pool_present ? "en" : "dis", pool_failure); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index 22a874eee2e84..8b7f300355710 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -1211,7 +1211,6 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter) struct cyclecounter cc; unsigned long flags; u32 incval = 0; - u32 tsauxc = 0; u32 fuse0 = 0; /* For some of the boards below this mask is technically incorrect. @@ -1246,18 +1245,6 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter) case ixgbe_mac_x550em_a: case ixgbe_mac_X550: cc.read = ixgbe_ptp_read_X550; - - /* enable SYSTIME counter */ - IXGBE_WRITE_REG(hw, IXGBE_SYSTIMR, 0); - IXGBE_WRITE_REG(hw, IXGBE_SYSTIML, 0); - IXGBE_WRITE_REG(hw, IXGBE_SYSTIMH, 0); - tsauxc = IXGBE_READ_REG(hw, IXGBE_TSAUXC); - IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, - tsauxc & ~IXGBE_TSAUXC_DISABLE_SYSTIME); - IXGBE_WRITE_REG(hw, IXGBE_TSIM, IXGBE_TSIM_TXTS); - IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EIMS_TIMESYNC); - - IXGBE_WRITE_FLUSH(hw); break; case ixgbe_mac_X540: cc.read = ixgbe_ptp_read_82599; @@ -1289,6 +1276,50 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter) spin_unlock_irqrestore(&adapter->tmreg_lock, flags); } +/** + * ixgbe_ptp_init_systime - Initialize SYSTIME registers + * @adapter: the ixgbe private board structure + * + * Initialize and start the SYSTIME registers. + */ +static void ixgbe_ptp_init_systime(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + u32 tsauxc; + + switch (hw->mac.type) { + case ixgbe_mac_X550EM_x: + case ixgbe_mac_x550em_a: + case ixgbe_mac_X550: + tsauxc = IXGBE_READ_REG(hw, IXGBE_TSAUXC); + + /* Reset SYSTIME registers to 0 */ + IXGBE_WRITE_REG(hw, IXGBE_SYSTIMR, 0); + IXGBE_WRITE_REG(hw, IXGBE_SYSTIML, 0); + IXGBE_WRITE_REG(hw, IXGBE_SYSTIMH, 0); + + /* Reset interrupt settings */ + IXGBE_WRITE_REG(hw, IXGBE_TSIM, IXGBE_TSIM_TXTS); + IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EIMS_TIMESYNC); + + /* Activate the SYSTIME counter */ + IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, + tsauxc & ~IXGBE_TSAUXC_DISABLE_SYSTIME); + break; + case ixgbe_mac_X540: + case ixgbe_mac_82599EB: + /* Reset SYSTIME registers to 0 */ + IXGBE_WRITE_REG(hw, IXGBE_SYSTIML, 0); + IXGBE_WRITE_REG(hw, IXGBE_SYSTIMH, 0); + break; + default: + /* Other devices aren't supported */ + return; + }; + + IXGBE_WRITE_FLUSH(hw); +} + /** * ixgbe_ptp_reset * @adapter: the ixgbe private board structure @@ -1315,6 +1346,8 @@ void ixgbe_ptp_reset(struct ixgbe_adapter *adapter) ixgbe_ptp_start_cyclecounter(adapter); + ixgbe_ptp_init_systime(adapter); + spin_lock_irqsave(&adapter->tmreg_lock, flags); timecounter_init(&adapter->hw_tc, &adapter->hw_cc, ktime_to_ns(ktime_get_real())); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 304435e561170..b991f03c7e991 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -706,6 +706,8 @@ static void mlx5e_build_rep_params(struct net_device *netdev) params->num_tc = 1; params->tunneled_offload_en = false; + if (rep->vport != MLX5_VPORT_UPLINK) + params->vlan_strip_disable = true; mlx5_query_min_inline(mdev, ¶ms->tx_min_inline_mode); diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c index 6137000b11c5c..73aac97fb5c96 100644 --- a/drivers/net/ethernet/moxa/moxart_ether.c +++ b/drivers/net/ethernet/moxa/moxart_ether.c @@ -74,11 +74,6 @@ static int moxart_set_mac_address(struct net_device *ndev, void *addr) static void moxart_mac_free_memory(struct net_device *ndev) { struct moxart_mac_priv_t *priv = netdev_priv(ndev); - int i; - - for (i = 0; i < RX_DESC_NUM; i++) - dma_unmap_single(&priv->pdev->dev, priv->rx_mapping[i], - priv->rx_buf_size, DMA_FROM_DEVICE); if (priv->tx_desc_base) dma_free_coherent(&priv->pdev->dev, @@ -193,6 +188,7 @@ static int moxart_mac_open(struct net_device *ndev) static int moxart_mac_stop(struct net_device *ndev) { struct moxart_mac_priv_t *priv = netdev_priv(ndev); + int i; napi_disable(&priv->napi); @@ -204,6 +200,11 @@ static int moxart_mac_stop(struct net_device *ndev) /* disable all functions */ writel(0, priv->base + REG_MAC_CTRL); + /* unmap areas mapped in moxart_mac_setup_desc_ring() */ + for (i = 0; i < RX_DESC_NUM; i++) + dma_unmap_single(&priv->pdev->dev, priv->rx_mapping[i], + priv->rx_buf_size, DMA_FROM_DEVICE); + return 0; } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index e14869a2e24a5..f60ffef33e0ce 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -378,8 +378,8 @@ try_again: ionic_opcode_to_str(opcode), opcode, ionic_error_to_str(err), err); - msleep(1000); iowrite32(0, &idev->dev_cmd_regs->done); + msleep(1000); iowrite32(1, &idev->dev_cmd_regs->doorbell); goto try_again; } @@ -392,6 +392,8 @@ try_again: return ionic_error_to_errno(err); } + ionic_dev_cmd_clean(ionic); + return 0; } diff --git a/drivers/net/ipa/ipa_mem.c b/drivers/net/ipa/ipa_mem.c index a78d66051a17d..25a8d029f2075 100644 --- a/drivers/net/ipa/ipa_mem.c +++ b/drivers/net/ipa/ipa_mem.c @@ -414,7 +414,7 @@ static int ipa_smem_init(struct ipa *ipa, u32 item, size_t size) } /* Align the address down and the size up to a page boundary */ - addr = qcom_smem_virt_to_phys(virt) & PAGE_MASK; + addr = qcom_smem_virt_to_phys(virt); phys = addr & PAGE_MASK; size = PAGE_ALIGN(size + addr - phys); iova = phys; /* We just want a direct mapping */ diff --git a/drivers/net/ipvlan/ipvtap.c b/drivers/net/ipvlan/ipvtap.c index 1cedb634f4f7b..f01078b2581ce 100644 --- a/drivers/net/ipvlan/ipvtap.c +++ b/drivers/net/ipvlan/ipvtap.c @@ -194,7 +194,7 @@ static struct notifier_block ipvtap_notifier_block __read_mostly = { .notifier_call = ipvtap_device_event, }; -static int ipvtap_init(void) +static int __init ipvtap_init(void) { int err; @@ -228,7 +228,7 @@ out1: } module_init(ipvtap_init); -static void ipvtap_exit(void) +static void __exit ipvtap_exit(void) { rtnl_link_unregister(&ipvtap_link_ops); unregister_netdevice_notifier(&ipvtap_notifier_block); diff --git a/drivers/nfc/pn533/uart.c b/drivers/nfc/pn533/uart.c index a0665d8ea85bc..e92535ebb5287 100644 --- a/drivers/nfc/pn533/uart.c +++ b/drivers/nfc/pn533/uart.c @@ -310,6 +310,7 @@ static void pn532_uart_remove(struct serdev_device *serdev) pn53x_unregister_nfc(pn532->priv); serdev_device_close(serdev); pn53x_common_clean(pn532->priv); + del_timer_sync(&pn532->cmd_timeout); kfree_skb(pn532->recv_skb); kfree(pn532); } diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index e20bcc835d6a8..82b658a3c220a 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -815,6 +815,7 @@ static int amd_gpio_suspend(struct device *dev) { struct amd_gpio *gpio_dev = dev_get_drvdata(dev); struct pinctrl_desc *desc = gpio_dev->pctrl->desc; + unsigned long flags; int i; for (i = 0; i < desc->npins; i++) { @@ -823,7 +824,9 @@ static int amd_gpio_suspend(struct device *dev) if (!amd_gpio_should_save(gpio_dev, pin)) continue; - gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin*4); + raw_spin_lock_irqsave(&gpio_dev->lock, flags); + gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin * 4) & ~PIN_IRQ_PENDING; + raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } return 0; @@ -833,6 +836,7 @@ static int amd_gpio_resume(struct device *dev) { struct amd_gpio *gpio_dev = dev_get_drvdata(dev); struct pinctrl_desc *desc = gpio_dev->pctrl->desc; + unsigned long flags; int i; for (i = 0; i < desc->npins; i++) { @@ -841,7 +845,10 @@ static int amd_gpio_resume(struct device *dev) if (!amd_gpio_should_save(gpio_dev, pin)) continue; - writel(gpio_dev->saved_regs[i], gpio_dev->base + pin*4); + raw_spin_lock_irqsave(&gpio_dev->lock, flags); + gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING; + writel(gpio_dev->saved_regs[i], gpio_dev->base + pin * 4); + raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } return 0; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 0ee0b80006e05..7ac1090d4379c 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1997,7 +1997,7 @@ static int storvsc_probe(struct hv_device *device, */ host_dev->handle_error_wq = alloc_ordered_workqueue("storvsc_error_wq_%d", - WQ_MEM_RECLAIM, + 0, host->host_no); if (!host_dev->handle_error_wq) { ret = -ENOMEM; diff --git a/drivers/scsi/ufs/ufshci.h b/drivers/scsi/ufs/ufshci.h index 1d999228efc85..e380941318117 100644 --- a/drivers/scsi/ufs/ufshci.h +++ b/drivers/scsi/ufs/ufshci.h @@ -129,11 +129,7 @@ enum { #define UFSHCD_UIC_MASK (UIC_COMMAND_COMPL | UFSHCD_UIC_PWR_MASK) -#define UFSHCD_ERROR_MASK (UIC_ERROR |\ - DEVICE_FATAL_ERROR |\ - CONTROLLER_FATAL_ERROR |\ - SYSTEM_BUS_FATAL_ERROR |\ - CRYPTO_ENGINE_FATAL_ERROR) +#define UFSHCD_ERROR_MASK (UIC_ERROR | INT_FATAL_ERRORS) #define INT_FATAL_ERRORS (DEVICE_FATAL_ERROR |\ CONTROLLER_FATAL_ERROR |\ diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index fe8df32bb612b..cd5f2f09468e2 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -581,27 +581,30 @@ static int lock_pages( struct privcmd_dm_op_buf kbufs[], unsigned int num, struct page *pages[], unsigned int nr_pages, unsigned int *pinned) { - unsigned int i; + unsigned int i, off = 0; - for (i = 0; i < num; i++) { + for (i = 0; i < num; ) { unsigned int requested; int page_count; requested = DIV_ROUND_UP( offset_in_page(kbufs[i].uptr) + kbufs[i].size, - PAGE_SIZE); + PAGE_SIZE) - off; if (requested > nr_pages) return -ENOSPC; page_count = pin_user_pages_fast( - (unsigned long) kbufs[i].uptr, + (unsigned long)kbufs[i].uptr + off * PAGE_SIZE, requested, FOLL_WRITE, pages); - if (page_count < 0) - return page_count; + if (page_count <= 0) + return page_count ? : -EFAULT; *pinned += page_count; nr_pages -= page_count; pages += page_count; + + off = (requested == page_count) ? 0 : off + page_count; + i += !off; } return 0; @@ -677,10 +680,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) } rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned); - if (rc < 0) { - nr_pages = pinned; + if (rc < 0) goto out; - } for (i = 0; i < kdata.num; i++) { set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr); @@ -692,7 +693,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) xen_preemptible_hcall_end(); out: - unlock_pages(pages, nr_pages); + unlock_pages(pages, pinned); kfree(xbufs); kfree(pages); kfree(kbufs); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d297804631829..be6935d191970 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -161,7 +161,7 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { btrfs_err(fs_info, - "replace devid present without an active replace item"); +"replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; } else { dev_replace->srcdev = NULL; @@ -954,8 +954,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) up_write(&dev_replace->rwsem); /* Scrub for replace must not be running in suspended state */ - ret = btrfs_scrub_cancel(fs_info); - ASSERT(ret != -ENOTCONN); + btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index db37a37996497..e9e8ca4e98a75 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -336,9 +336,10 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) + if (ret < 0) { + err = ret; goto out; - if (ret == 0) { + } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index f1a60bcdb3db8..cd6049b0bde53 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -389,6 +389,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, const char *name, const void *buffer, size_t size, int flags) { + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + name = xattr_full_name(handler, name); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 9fdecd9090493..70cd0d764c447 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -321,7 +321,7 @@ static int read_name_gen = 1; static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, struct nfs_fh *src_fh, nfs4_stateid *stateid) { - struct nfs_fattr fattr; + struct nfs_fattr *fattr = nfs_alloc_fattr(); struct file *filep, *res; struct nfs_server *server; struct inode *r_ino = NULL; @@ -332,14 +332,20 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, server = NFS_SERVER(ss_mnt->mnt_root->d_inode); - nfs_fattr_init(&fattr); + if (!fattr) + return ERR_PTR(-ENOMEM); - status = nfs4_proc_getattr(server, src_fh, &fattr, NULL, NULL); + status = nfs4_proc_getattr(server, src_fh, fattr, NULL, NULL); if (status < 0) { res = ERR_PTR(status); goto out; } + if (!S_ISREG(fattr->mode)) { + res = ERR_PTR(-EBADF); + goto out; + } + res = ERR_PTR(-ENOMEM); len = strlen(SSC_READ_NAME_BODY) + 16; read_name = kzalloc(len, GFP_NOFS); @@ -347,7 +353,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out; snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); - r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, &fattr, + r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr, NULL); if (IS_ERR(r_ino)) { res = ERR_CAST(r_ino); @@ -358,6 +364,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, r_ino->i_fop); if (IS_ERR(filep)) { res = ERR_CAST(filep); + iput(r_ino); goto out_free_name; } filep->f_mode |= FMODE_READ; @@ -392,6 +399,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, out_free_name: kfree(read_name); out: + nfs_free_fattr(fattr); return res; out_stateowner: nfs4_put_state_owner(sp); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ba98371e9d164..ef18f0d71b11b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -503,10 +503,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; - bool migration = false; + bool migration = false, young = false, dirty = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); + young = pte_young(*pte); + dirty = pte_dirty(*pte); } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); @@ -540,8 +542,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), - locked, migration); + smaps_account(mss, page, false, young, dirty, locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/fs/sync.c b/fs/sync.c index 1373a610dc784..79180e58d8628 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -21,25 +21,6 @@ #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) -/* - * Do the filesystem syncing work. For simple filesystems - * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to - * submit IO for these buffers via __sync_blockdev(). This also speeds up the - * wait == 1 case since in that case write_inode() functions do - * sync_dirty_buffer() and thus effectively write one block at a time. - */ -static int __sync_filesystem(struct super_block *sb, int wait) -{ - if (wait) - sync_inodes_sb(sb); - else - writeback_inodes_sb(sb, WB_REASON_SYNC); - - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - /* * Write out and wait upon all dirty data associated with this * superblock. Filesystem data as well as the underlying block @@ -47,7 +28,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) */ int sync_filesystem(struct super_block *sb) { - int ret; + int ret = 0; /* * We need to be protected against the filesystem going from @@ -61,10 +42,31 @@ int sync_filesystem(struct super_block *sb) if (sb_rdonly(sb)) return 0; - ret = __sync_filesystem(sb, 0); - if (ret < 0) + /* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have + * to submit I/O for these buffers via __sync_blockdev(). This also + * speeds up the wait == 1 case since in that case write_inode() + * methods call sync_dirty_buffer() and thus effectively write one block + * at a time. + */ + writeback_inodes_sb(sb, WB_REASON_SYNC); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 0); + if (ret) + return ret; + } + ret = __sync_blockdev(sb->s_bdev, 0); + if (ret) return ret; - return __sync_filesystem(sb, 1); + + sync_inodes_sb(sb); + if (sb->s_op->sync_fs) { + ret = sb->s_op->sync_fs(sb, 1); + if (ret) + return ret; + } + return __sync_blockdev(sb->s_bdev, 1); } EXPORT_SYMBOL(sync_filesystem); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 646735aad45df..103fa8381e7dc 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -371,7 +371,7 @@ int xfs_ioc_attr_list( struct xfs_inode *dp, void __user *ubuf, - int bufsize, + size_t bufsize, int flags, struct xfs_attrlist_cursor __user *ucursor) { @@ -1689,7 +1689,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count < 2) return -EINVAL; - if (bmx.bmv_count > ULONG_MAX / recsize) + if (bmx.bmv_count >= INT_MAX / recsize) return -ENOMEM; buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index bab6a5a924077..416e20de66e7d 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -38,8 +38,9 @@ xfs_readlink_by_handle( int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, uint32_t opcode, void __user *uname, void __user *value, uint32_t *len, uint32_t flags); -int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, - int flags, struct xfs_attrlist_cursor __user *ucursor); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, + size_t bufsize, int flags, + struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 6323974d6b3e6..434c87cc9fbf5 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -757,6 +757,7 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); + int error; /* * Doing anything during the async pass would be counterproductive. @@ -764,7 +765,10 @@ xfs_fs_sync_fs( if (!wait) return 0; - xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + if (laptop_mode) { /* * The disk must be active because we're syncing. @@ -1716,6 +1720,11 @@ xfs_remount_ro( }; int error; + /* Flush all the dirty data to disk. */ + error = sync_filesystem(mp->m_super); + if (error) + return error; + /* * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. @@ -1786,8 +1795,6 @@ xfs_fc_reconfigure( if (error) return error; - sync_filesystem(mp->m_super); - /* inode32 -> inode64 */ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index d16302d3eb597..72f1e2a8c1670 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -114,7 +114,7 @@ static inline bool memory_contains(void *begin, void *end, void *virt, /** * memory_intersects - checks if the region occupied by an object intersects * with another memory region - * @begin: virtual address of the beginning of the memory regien + * @begin: virtual address of the beginning of the memory region * @end: virtual address of the end of the memory region * @virt: virtual address of the memory object * @size: size of the memory object @@ -127,7 +127,10 @@ static inline bool memory_intersects(void *begin, void *end, void *virt, { void *vend = virt + size; - return (virt >= begin && virt < end) || (vend >= begin && vend < end); + if (virt < end && vend > begin) + return true; + + return false; } /** diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ed2d531400051..6564fb4ac49e1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -633,9 +633,23 @@ extern int sysctl_devconf_inherit_init_net; */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return !IS_ENABLED(CONFIG_SYSCTL) || - !sysctl_fb_tunnels_only_for_init_net || - (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1); +#if IS_ENABLED(CONFIG_SYSCTL) + int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); + + return !fb_tunnels_only_for_init_net || + (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); +#else + return true; +#endif +} + +static inline int net_inherit_devconf(void) +{ +#if IS_ENABLED(CONFIG_SYSCTL) + return READ_ONCE(sysctl_devconf_inherit_init_net); +#else + return 0; +#endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 3a956145a25cb..a18fb73a2b772 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -94,10 +94,6 @@ struct ebt_table { struct ebt_replace_kernel *table; unsigned int valid_hooks; rwlock_t lock; - /* e.g. could be the table explicitly only allows certain - * matches, targets, ... 0 == let it in */ - int (*check)(const struct ebt_table_info *info, - unsigned int valid_hooks); /* the data used by the kernel */ struct ebt_table_info *private; struct module *me; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4e8425c1c5605..b055c217eb0be 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -542,10 +542,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_boosted tells if we are boosted due to DI. If so we are - * outside bandwidth enforcement mechanism (but only until we - * exit the critical section); - * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 716b7c5f6fdd9..36e5e75e71720 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -31,7 +31,7 @@ extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { - return sysctl_net_busy_poll; + return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b9948e7861f22..e66fee99ed3ea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -176,13 +176,18 @@ struct nft_ctx { bool report; }; +enum nft_data_desc_flags { + NFT_DATA_DESC_SETELEM = (1 << 0), +}; + struct nft_data_desc { enum nft_data_types type; + unsigned int size; unsigned int len; + unsigned int flags; }; -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index fd10a7862fdc6..ce75121782bf7 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -38,6 +38,14 @@ struct nft_cmp_fast_expr { bool inv; }; +struct nft_cmp16_fast_expr { + struct nft_data data; + struct nft_data mask; + u8 sreg; + u8 len; + bool inv; +}; + struct nft_immediate_expr { struct nft_data data; u8 dreg; @@ -55,6 +63,7 @@ static inline u32 nft_cmp_fast_mask(unsigned int len) } extern const struct nft_expr_ops nft_cmp_fast_ops; +extern const struct nft_expr_ops nft_cmp16_fast_ops; struct nft_payload { enum nft_payload_bases base:8; diff --git a/include/net/sock.h b/include/net/sock.h index 333131f47ac13..d31c2b9107e54 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2678,18 +2678,18 @@ static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); - return *proto->sysctl_wmem; + return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); - return *proto->sysctl_rmem; + return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 5b3f01da172bc..b2ebacd2f3097 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true); if (ret < 0) { + audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index de636b7445b11..e4dcc23b52c01 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5282,8 +5282,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *regs = cur_regs(env), *reg; struct bpf_map *map = meta->map_ptr; - struct tnum range; - u64 val; + u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) @@ -5293,10 +5292,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - range = tnum_range(0, map->max_entries - 1); reg = ®s[BPF_REG_3]; + val = reg->var_off.value; + max = map->max_entries; - if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + if (!(register_is_const(reg) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -5304,8 +5304,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, err = mark_chain_precision(env, BPF_REG_3); if (err) return err; - - val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); else if (!bpf_map_key_poisoned(aux) && diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa72..cdecd47e5580d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -268,6 +268,7 @@ COND_SYSCALL_COMPAT(keyctl); /* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); +COND_SYSCALL_COMPAT(fadvise64_64); /* mm/, CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/lib/ratelimit.c b/lib/ratelimit.c index e01a93f46f833..ce945c17980b9 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -26,10 +26,16 @@ */ int ___ratelimit(struct ratelimit_state *rs, const char *func) { + /* Paired with WRITE_ONCE() in .proc_handler(). + * Changing two values seperately could be inconsistent + * and some message could be lost. (See: net_ratelimit_state). + */ + int interval = READ_ONCE(rs->interval); + int burst = READ_ONCE(rs->burst); unsigned long flags; int ret; - if (!rs->interval) + if (!interval) return 1; /* @@ -44,7 +50,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (!rs->begin) rs->begin = jiffies; - if (time_is_before_jiffies(rs->begin + rs->interval)) { + if (time_is_before_jiffies(rs->begin + interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { printk_deferred(KERN_WARNING @@ -56,7 +62,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->begin = jiffies; rs->printed = 0; } - if (rs->burst && rs->burst > rs->printed) { + if (burst && burst > rs->printed) { rs->printed++; ret = 1; } else { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 594368f6134f1..cb7b0aead7096 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1691,7 +1691,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -2110,7 +2110,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); diff --git a/mm/mmap.c b/mm/mmap.c index a50042918cc7e..a1ee93f55cebb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1694,8 +1694,12 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; - /* Do we need to track softdirty? */ - if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) && + !is_vm_hugetlb_page(vma)) return 1; /* Specialty mapping? */ diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 32bc2821027f3..57f91efce0f73 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -36,18 +36,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)&initial_chain, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~(1 << NF_BR_BROUTING)) - return -EINVAL; - return 0; -} - static const struct ebt_table broute_table = { .name = "broute", .table = &initial_table, .valid_hooks = 1 << NF_BR_BROUTING, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index bcf982e12f16b..7f2e620f4978f 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~FILTER_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_filter = { .name = "filter", .table = &initial_table, .valid_hooks = FILTER_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0d092773f8161..1743a105485c4 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~NAT_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_nat = { .name = "nat", .table = &initial_table, .valid_hooks = NAT_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index d481ff24a1501..310740cc684ad 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -999,8 +999,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_iterate; } - /* the table doesn't like it */ - if (t->check && (ret = t->check(newinfo, repl->valid_hooks))) + if (repl->valid_hooks != t->valid_hooks) goto free_unlock; if (repl->num_counters && repl->num_counters != t->private->nentries) { @@ -1186,11 +1185,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, if (ret != 0) goto free_chainstack; - if (table->check && table->check(newinfo, table->valid_hooks)) { - ret = -EINVAL; - goto free_chainstack; - } - table->private = newinfo; rwlock_init(&table->lock); mutex_lock(&ebt_mutex); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 5f773624948ff..d67d06d6b817c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -15,18 +15,6 @@ DEFINE_BPF_STORAGE_CACHE(sk_cache); -static int omem_charge(struct sock *sk, unsigned int size) -{ - /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { - atomic_add(size, &sk->sk_omem_alloc); - return 0; - } - - return -ENOMEM; -} - static struct bpf_local_storage_data * sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { @@ -316,7 +304,17 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { - return omem_charge(owner, size); + int optmem_max = READ_ONCE(sysctl_optmem_max); + struct sock *sk = (struct sock *)owner; + + /* same check as in sock_kmalloc() */ + if (size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; } static void sk_storage_uncharge(struct bpf_local_storage_map *smap, diff --git a/net/core/dev.c b/net/core/dev.c index 637bc576fbd26..8355cc5e11a98 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4516,7 +4516,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (netdev_max_backlog >> 1)) + if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4564,7 +4564,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { + if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -4795,7 +4795,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5156,7 +5156,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!netdev_tstamp_prequeue, skb); + net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5558,7 +5558,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5588,7 +5588,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); @@ -6371,7 +6371,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = dev_rx_weight; + napi->weight = READ_ONCE(dev_rx_weight); while (again) { struct sk_buff *skb; @@ -6879,8 +6879,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(netdev_budget_usecs); - int budget = netdev_budget; + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); + int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); diff --git a/net/core/filter.c b/net/core/filter.c index 815edf7bc4390..4c22e6d1da746 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1212,10 +1212,11 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); + int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ - if (filter_size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + if (filter_size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } @@ -1547,7 +1548,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > sysctl_optmem_max) + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1614,7 +1615,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { err = -ENOMEM; goto err_prog_put; } @@ -4713,14 +4714,14 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, /* Only some socketops are supported */ switch (optname) { case SO_RCVBUF: - val = min_t(u32, val, sysctl_rmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_rmem_max)); val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 6eb2e5ec2c506..2f66f3f295630 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -26,7 +26,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { drop: atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 48b6438f2a3d9..635cabcf8794f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4691,7 +4691,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(sysctl_tstamp_allow_data || tsonly)) + if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) return true; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/core/sock.c b/net/core/sock.c index 6d9af4ef93d7a..1bb6a003323b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -887,7 +887,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. @@ -919,7 +919,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); + __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: @@ -2219,7 +2219,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - sysctl_optmem_max) + READ_ONCE(sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2237,8 +2237,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - if ((unsigned int)size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + int optmem_max = READ_ONCE(sysctl_optmem_max); + + if ((unsigned int)size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. @@ -2974,8 +2976,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; - sk->sk_rcvbuf = sysctl_rmem_default; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); @@ -3030,7 +3032,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_busy_read; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 2e0a4378e778a..0dfe9f255ab3a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -235,14 +235,17 @@ static int set_default_qdisc(struct ctl_table *table, int write, static int proc_do_dev_weight(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret; + static DEFINE_MUTEX(dev_weight_mutex); + int ret, weight; + mutex_lock(&dev_weight_mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret != 0) - return ret; - - dev_rx_weight = weight_p * dev_weight_rx_bias; - dev_tx_weight = weight_p * dev_weight_tx_bias; + if (!ret && write) { + weight = READ_ONCE(weight_p); + WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + } + mutex_unlock(&dev_weight_mutex); return ret; } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index dc92a67baea39..7d542eb461729 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -480,8 +480,8 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; - sk->sk_sndbuf = sysctl_decnet_wmem[1]; - sk->sk_rcvbuf = sysctl_decnet_rmem[1]; + sk->sk_sndbuf = READ_ONCE(sysctl_decnet_wmem[1]); + sk->sk_rcvbuf = READ_ONCE(sysctl_decnet_rmem[1]); /* Initialization of DECnet Session Control Port */ scp = DN_SK(sk); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 148ef484a66ce..8f17538755507 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2668,23 +2668,27 @@ static __net_init int devinet_init_net(struct net *net) #endif if (!net_eq(net, &init_net)) { - if (IS_ENABLED(CONFIG_SYSCTL) && - sysctl_devconf_inherit_init_net == 3) { + switch (net_inherit_devconf()) { + case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); - } else if (!IS_ENABLED(CONFIG_SYSCTL) || - sysctl_devconf_inherit_init_net != 2) { - /* inherit == 0 or 1: copy from init_net */ + break; + case 0: + case 1: + /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); + break; + case 2: + /* use compiled values */ + break; } - /* else inherit == 2: use compiled values */ } #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f77b0af3cb657..0dbf950de832f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1721,7 +1721,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); ipc.sockc.mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 22507a6a3f71c..4cc39c62af55d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -773,7 +773,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > sysctl_optmem_max) + if (optlen > READ_ONCE(sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -808,7 +808,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > sysctl_optmem_max - 4) + if (optlen > READ_ONCE(sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); @@ -1231,7 +1231,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; - if (optlen > sysctl_optmem_max) { + if (optlen > READ_ONCE(sysctl_optmem_max)) { err = -ENOBUFS; break; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 78460eb39b3af..bfeb05f62b94f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -451,8 +451,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); - WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); sk_sockets_allocated_inc(sk); sk->sk_route_forced_caps = NETIF_F_GSO; @@ -1711,7 +1711,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else - cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; + cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d35e88b5ffcbe..41b44b311e8a0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -425,7 +425,7 @@ static void tcp_sndbuf_expand(struct sock *sk) if (sk->sk_sndbuf < sndmem) WRITE_ONCE(sk->sk_sndbuf, - min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -454,12 +454,13 @@ static void tcp_sndbuf_expand(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, + unsigned int skbtruesize) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -471,7 +472,27 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing + * can play nice with us, as sk_buff and skb->head might be either + * freed or shared with up to MAX_SKB_FRAGS segments. + * Only give a boost to drivers using page frag(s) to hold the frame(s), + * and if no payload was pulled in skb->head before reaching us. + */ +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) +{ + u32 truesize = skb->truesize; + + if (adjust && !skb_headlen(skb)) { + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); + /* paranoid check, some drivers might be buggy */ + if (unlikely((int)truesize < (int)skb->len)) + truesize = skb->truesize; + } + return truesize; +} + +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, + bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; @@ -480,15 +501,16 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (room > 0 && !tcp_under_memory_pressure(sk)) { + unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else - incr = __tcp_grow_window(sk, skb); + incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); @@ -543,16 +565,17 @@ static void tcp_clamp_window(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); + int rmem2; icsk->icsk_ack.quick = 0; + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, - min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2])); + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -714,7 +737,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(rcvwin, tp->advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); @@ -782,7 +805,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_ecn_check_ce(sk, skb); if (skb->len >= 128) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -4761,7 +4784,7 @@ coalesce_done: * and trigger fast retransmit. */ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4849,7 +4872,7 @@ end: * and trigger fast retransmit. */ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, false); skb_condense(skb); skb_set_owner_r(skb, sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4c9274cb92d55..48fce999dc612 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -238,8 +238,8 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); - space = max_t(u32, space, sysctl_rmem_max); + space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); + space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, *window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 05317e6f48f8a..ed1e5bfc97b31 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7042,9 +7042,8 @@ static int __net_init addrconf_init_net(struct net *net) if (!dflt) goto err_alloc_dflt; - if (IS_ENABLED(CONFIG_SYSCTL) && - !net_eq(net, &init_net)) { - switch (sysctl_devconf_inherit_init_net) { + if (!net_eq(net, &init_net)) { + switch (net_inherit_devconf()) { case 1: /* copy from init_net */ memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 43a894bf9a1be..6fa118bf40cdd 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -208,7 +208,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > sysctl_optmem_max) + if (optlen > READ_ONCE(sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -242,7 +242,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > sysctl_optmem_max - 4) + if (optlen > READ_ONCE(sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); diff --git a/net/key/af_key.c b/net/key/af_key.c index 2aa16a171285b..05e2710988883 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1701,9 +1701,12 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad pfk->registered |= (1<sadb_msg_satype); } + mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); + mutex_unlock(&pfkey_mutex); + if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<sadb_msg_satype); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d0e91aa7b30e5..e61c85873ea2f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1439,7 +1439,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(rcvwin, advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; @@ -1872,8 +1872,8 @@ static int mptcp_init_sock(struct sock *sk) return ret; sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 16b48064f715e..daab857c52a80 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1280,12 +1280,12 @@ static void set_sock_size(struct sock *sk, int mode, int val) lock_sock(sk); if (mode) { val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, - sysctl_wmem_max); + READ_ONCE(sysctl_wmem_max)); sk->sk_sndbuf = val * 2; sk->sk_userlocks |= SOCK_SNDBUF_LOCK; } else { val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, - sysctl_rmem_max); + READ_ONCE(sysctl_rmem_max)); sk->sk_rcvbuf = val * 2; sk->sk_userlocks |= SOCK_RCVBUF_LOCK; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 30bd4b867912c..1b039476e4d6a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1999,9 +1999,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { const struct nlattr * const *nla = ctx->nla; + struct nft_stats __percpu *stats = NULL; struct nft_table *table = ctx->table; struct nft_base_chain *basechain; - struct nft_stats __percpu *stats; struct net *net = ctx->net; char name[NFT_NAME_MAXLEN]; struct nft_trans *trans; @@ -2037,7 +2037,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return PTR_ERR(stats); } rcu_assign_pointer(basechain->stats, stats); - static_branch_inc(&nft_counters_enabled); } err = nft_basechain_init(basechain, family, &hook, flags); @@ -2120,6 +2119,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err_unregister_hook; } + if (stats) + static_branch_inc(&nft_counters_enabled); + table->use++; return 0; @@ -4839,19 +4841,13 @@ static int nft_setelem_parse_flags(const struct nft_set *set, static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, struct nft_data *key, struct nlattr *attr) { - struct nft_data_desc desc; - int err; - - err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); - if (err < 0) - return err; - - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { - nft_data_release(key, desc.type); - return -EINVAL; - } + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = NFT_DATA_VALUE_MAXLEN, + .len = set->klen, + }; - return 0; + return nft_data_init(ctx, key, &desc, attr); } static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, @@ -4860,24 +4856,18 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nlattr *attr) { u32 dtype; - int err; - - err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); - if (err < 0) - return err; if (set->dtype == NFT_DATA_VERDICT) dtype = NFT_DATA_VERDICT; else dtype = NFT_DATA_VALUE; - if (dtype != desc->type || - set->dlen != desc->len) { - nft_data_release(data, desc->type); - return -EINVAL; - } + desc->type = dtype; + desc->size = NFT_DATA_VALUE_MAXLEN; + desc->len = set->dlen; + desc->flags = NFT_DATA_DESC_SETELEM; - return 0; + return nft_data_init(ctx, data, desc, attr); } static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, @@ -8688,6 +8678,11 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, return PTR_ERR(chain); if (nft_is_base_chain(chain)) return -EOPNOTSUPP; + if (nft_chain_is_bound(chain)) + return -EINVAL; + if (desc->flags & NFT_DATA_DESC_SETELEM && + chain->flags & NFT_CHAIN_BINDING) + return -EINVAL; chain->use++; data->verdict.chain = chain; @@ -8695,7 +8690,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, } desc->len = sizeof(data->verdict); - desc->type = NFT_DATA_VERDICT; + return 0; } @@ -8748,20 +8743,25 @@ nla_put_failure: } static int nft_value_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, - struct nft_data_desc *desc, const struct nlattr *nla) + struct nft_data *data, struct nft_data_desc *desc, + const struct nlattr *nla) { unsigned int len; len = nla_len(nla); if (len == 0) return -EINVAL; - if (len > size) + if (len > desc->size) return -EOVERFLOW; + if (desc->len) { + if (len != desc->len) + return -EINVAL; + } else { + desc->len = len; + } nla_memcpy(data->data, nla, len); - desc->type = NFT_DATA_VALUE; - desc->len = len; + return 0; } @@ -8781,7 +8781,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * * @ctx: context of the expression using the data * @data: destination struct nft_data - * @size: maximum data length * @desc: data description * @nla: netlink attribute containing data * @@ -8791,24 +8790,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * The caller can indicate that it only wants to accept data of type * NFT_DATA_VALUE by passing NULL for the ctx argument. */ -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { struct nlattr *tb[NFTA_DATA_MAX + 1]; int err; + if (WARN_ON_ONCE(!desc->size)) + return -EINVAL; + err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL); if (err < 0) return err; - if (tb[NFTA_DATA_VALUE]) - return nft_value_init(ctx, data, size, desc, - tb[NFTA_DATA_VALUE]); - if (tb[NFTA_DATA_VERDICT] && ctx != NULL) - return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); - return -EINVAL; + if (tb[NFTA_DATA_VALUE]) { + if (desc->type != NFT_DATA_VALUE) + return -EINVAL; + + err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); + } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) { + if (desc->type != NFT_DATA_VERDICT) + return -EINVAL; + + err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); + } else { + err = -EINVAL; + } + + return err; } EXPORT_SYMBOL_GPL(nft_data_init); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index a61b5bf5aa0fb..9dc18429ed875 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -67,6 +67,50 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +static void nft_cmp16_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs) +{ + const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + const u64 *reg_data = (const u64 *)®s->data[priv->sreg]; + const u64 *mask = (const u64 *)&priv->mask; + const u64 *data = (const u64 *)&priv->data; + + if (((reg_data[0] & mask[0]) == data[0] && + ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv) + return; + regs->verdict.code = NFT_BREAK; +} + +static noinline void __nft_trace_verdict(struct nft_traceinfo *info, + const struct nft_chain *chain, + const struct nft_regs *regs) +{ + enum nft_trace_types type; + + switch (regs->verdict.code) { + case NFT_CONTINUE: + case NFT_RETURN: + type = NFT_TRACETYPE_RETURN; + break; + default: + type = NFT_TRACETYPE_RULE; + break; + } + + __nft_trace_packet(info, chain, type); +} + +static inline void nft_trace_verdict(struct nft_traceinfo *info, + const struct nft_chain *chain, + const struct nft_rule *rule, + const struct nft_regs *regs) +{ + if (static_branch_unlikely(&nft_trace_enabled)) { + info->rule = rule; + __nft_trace_verdict(info, chain, regs); + } +} + static bool nft_payload_fast_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -185,6 +229,8 @@ next_rule: nft_rule_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); + else if (expr->ops == &nft_cmp16_fast_ops) + nft_cmp16_fast_eval(expr, ®s); else if (expr->ops == &nft_bitwise_fast_ops) nft_bitwise_fast_eval(expr, ®s); else if (expr->ops != &nft_payload_fast_ops || @@ -207,13 +253,13 @@ next_rule: break; } + nft_trace_verdict(&info, chain, rule, ®s); + switch (regs.verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_DROP: case NF_QUEUE: case NF_STOLEN: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RULE); return regs.verdict.code; } @@ -226,15 +272,10 @@ next_rule: stackptr++; fallthrough; case NFT_GOTO: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RULE); - chain = regs.verdict.chain; goto do_chain; case NFT_CONTINUE: case NFT_RETURN: - nft_trace_packet(&info, chain, rule, - NFT_TRACETYPE_RETURN); break; default: WARN_ON(1); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 47b0dba95054f..d6ab7aa14adc2 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -93,7 +93,16 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { static int nft_bitwise_init_bool(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc mask, xor; + struct nft_data_desc mask = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->mask), + .len = priv->len, + }; + struct nft_data_desc xor = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->xor), + .len = priv->len, + }; int err; if (tb[NFTA_BITWISE_DATA]) @@ -103,36 +112,30 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, !tb[NFTA_BITWISE_XOR]) return -EINVAL; - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask, - tb[NFTA_BITWISE_MASK]); + err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) { - err = -EINVAL; - goto err1; - } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor, - tb[NFTA_BITWISE_XOR]); + err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]); if (err < 0) - goto err1; - if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) { - err = -EINVAL; - goto err2; - } + goto err_xor_err; return 0; -err2: - nft_data_release(&priv->xor, xor.type); -err1: + +err_xor_err: nft_data_release(&priv->mask, mask.type); + return err; } static int nft_bitwise_init_shift(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc d; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = sizeof(u32), + }; int err; if (tb[NFTA_BITWISE_MASK] || @@ -142,13 +145,12 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, if (!tb[NFTA_BITWISE_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, - tb[NFTA_BITWISE_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]); if (err < 0) return err; - if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || - priv->data.data[0] >= BITS_PER_TYPE(u32)) { - nft_data_release(&priv->data, d.type); + + if (priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, desc.type); return -EINVAL; } @@ -290,22 +292,21 @@ static const struct nft_expr_ops nft_bitwise_ops = { static int nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out) { - struct nft_data_desc desc; struct nft_data data; - int err = 0; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + .len = sizeof(u32), + }; + int err; - err = nft_data_init(NULL, &data, sizeof(data), &desc, tb); + err = nft_data_init(NULL, &data, &desc, tb); if (err < 0) return err; - if (desc.type != NFT_DATA_VALUE || desc.len != sizeof(u32)) { - err = -EINVAL; - goto err; - } *out = data.data[0]; -err: - nft_data_release(&data, desc.type); - return err; + + return 0; } static int nft_bitwise_fast_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index b529c0e865466..461763a571f20 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -73,20 +73,16 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + }; int err; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; - if (desc.type != NFT_DATA_VALUE) { - err = -EINVAL; - nft_data_release(&priv->data, desc.type); - return err; - } - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -201,12 +197,14 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; int err; - err = nft_data_init(NULL, &data, sizeof(data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; @@ -272,12 +270,108 @@ const struct nft_expr_ops nft_cmp_fast_ops = { .offload = nft_cmp_fast_offload, }; +static u32 nft_cmp_mask(u32 bitlen) +{ + return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen)); +} + +static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen) +{ + int len = bitlen / BITS_PER_BYTE; + int i, words = len / sizeof(u32); + + for (i = 0; i < words; i++) { + data->data[i] = 0xffffffff; + bitlen -= sizeof(u32) * BITS_PER_BYTE; + } + + if (len % sizeof(u32)) + data->data[i++] = nft_cmp_mask(bitlen); + + for (; i < 4; i++) + data->data[i] = 0; +} + +static int nft_cmp16_fast_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + }; + int err; + + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); + if (err < 0) + return err; + + err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + if (err < 0) + return err; + + nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE); + priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ; + priv->len = desc.len; + + return 0; +} + +static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + struct nft_cmp_expr cmp = { + .data = priv->data, + .sreg = priv->sreg, + .len = priv->len, + .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ, + }; + + return __nft_cmp_offload(ctx, flow, &cmp); +} + +static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); + enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; + + if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + + +const struct nft_expr_ops nft_cmp16_fast_ops = { + .type = &nft_cmp_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)), + .eval = NULL, /* inlined */ + .init = nft_cmp16_fast_init, + .dump = nft_cmp16_fast_dump, + .offload = nft_cmp16_fast_offload, +}; + static const struct nft_expr_ops * nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { - struct nft_data_desc desc; struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; enum nft_cmp_ops op; + u8 sreg; int err; if (tb[NFTA_CMP_SREG] == NULL || @@ -298,23 +392,21 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return ERR_PTR(-EINVAL); } - err = nft_data_init(NULL, &data, sizeof(data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return ERR_PTR(err); - if (desc.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err1; - } - - if (desc.len <= sizeof(u32) && (op == NFT_CMP_EQ || op == NFT_CMP_NEQ)) - return &nft_cmp_fast_ops; + sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); + if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) { + if (desc.len <= sizeof(u32)) + return &nft_cmp_fast_ops; + else if (desc.len <= sizeof(data) && + ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) || + (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0))) + return &nft_cmp16_fast_ops; + } return &nft_cmp_ops; -err1: - nft_data_release(&data, desc.type); - return ERR_PTR(-EINVAL); } struct nft_expr_type nft_cmp_type __read_mostly = { diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index d0f67d325bdfd..fcdbc5ed3f367 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -29,20 +29,36 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, }; +static enum nft_data_types nft_reg_to_type(const struct nlattr *nla) +{ + enum nft_data_types type; + u8 reg; + + reg = ntohl(nla_get_be32(nla)); + if (reg == NFT_REG_VERDICT) + type = NFT_DATA_VERDICT; + else + type = NFT_DATA_VALUE; + + return type; +} + static int nft_immediate_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_immediate_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .size = sizeof(priv->data), + }; int err; if (tb[NFTA_IMMEDIATE_DREG] == NULL || tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; - err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_IMMEDIATE_DATA]); + desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]); + err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index d82677e83400b..720dc9fba6d4f 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -115,9 +115,21 @@ static int nft_osf_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { - return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_FORWARD)); + unsigned int hooks; + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); } static struct nft_expr_type nft_osf_type; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 01878c16418c2..551e0d6cf63d4 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -660,17 +660,23 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_payload_set *priv = nft_expr_priv(expr); + u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; + int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); if (tb[NFTA_PAYLOAD_CSUM_TYPE]) - priv->csum_type = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); - if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) - priv->csum_offset = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); + if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX, + &csum_offset); + if (err < 0) + return err; + + priv->csum_offset = csum_offset; + } if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) { u32 flags; @@ -681,7 +687,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->csum_flags = flags; } - switch (priv->csum_type) { + switch (csum_type) { case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; @@ -695,6 +701,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, default: return -EOPNOTSUPP; } + priv->csum_type = csum_type; return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg, priv->len); @@ -733,6 +740,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, { enum nft_payload_bases base; unsigned int offset, len; + int err; if (tb[NFTA_PAYLOAD_BASE] == NULL || tb[NFTA_PAYLOAD_OFFSET] == NULL || @@ -758,8 +766,13 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + if (err < 0) + return ERR_PTR(err); + + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len); + if (err < 0) + return ERR_PTR(err); if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && base != NFT_PAYLOAD_LL_HEADER) diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index e4a1c44d7f513..e6bbe32c323df 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -51,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr const struct nlattr * const tb[]) { struct nft_range_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc_from, desc_to; + struct nft_data_desc desc_from = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_from), + }; + struct nft_data_desc desc_to = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_to), + }; int err; u32 op; @@ -61,26 +68,16 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr !tb[NFTA_RANGE_TO_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), - &desc_from, tb[NFTA_RANGE_FROM_DATA]); + err = nft_data_init(NULL, &priv->data_from, &desc_from, + tb[NFTA_RANGE_FROM_DATA]); if (err < 0) return err; - if (desc_from.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err1; - } - - err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), - &desc_to, tb[NFTA_RANGE_TO_DATA]); + err = nft_data_init(NULL, &priv->data_to, &desc_to, + tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; - if (desc_to.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err2; - } - if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 3b27926d5382c..2ee50996da8cc 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -133,6 +133,7 @@ static const struct nft_expr_ops nft_tunnel_get_ops = { static struct nft_expr_type nft_tunnel_type __read_mostly = { .name = "tunnel", + .family = NFPROTO_NETDEV, .ops = &nft_tunnel_get_ops, .policy = nft_tunnel_policy, .maxattr = NFTA_TUNNEL_MAX, diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 11c45c8c6c164..036d92c0ad794 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -96,7 +96,8 @@ static void rose_loopback_timer(struct timer_list *unused) } if (frametype == ROSE_CALL_REQUEST) { - if (!rose_loopback_neigh->dev) { + if (!rose_loopback_neigh->dev && + !rose_loopback_neigh->loopback) { kfree_skb(skb); continue; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 043508fd8d8a5..150cd7b2154c8 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -285,8 +285,10 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, p->user_call_ID); limiter = rxrpc_get_call_slot(p, gfp); - if (!limiter) + if (!limiter) { + release_sock(&rx->sk); return ERR_PTR(-ERESTARTSYS); + } call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id); if (IS_ERR(call)) { diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index aa23ba4e25662..eef3c14fd1c18 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -51,10 +51,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, return sock_intr_errno(*timeo); trace_rxrpc_transmit(call, rxrpc_transmit_wait); - mutex_unlock(&call->user_mutex); *timeo = schedule_timeout(*timeo); - if (mutex_lock_interruptible(&call->user_mutex) < 0) - return sock_intr_errno(*timeo); } } @@ -290,37 +287,48 @@ out: static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, size_t len, - rxrpc_notify_end_tx_t notify_end_tx) + rxrpc_notify_end_tx_t notify_end_tx, + bool *_dropped_lock) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; struct sock *sk = &rx->sk; + enum rxrpc_call_state state; long timeo; - bool more; - int ret, copied; + bool more = msg->msg_flags & MSG_MORE; + int ret, copied = 0; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); +reload: + ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) - return -EPIPE; - - more = msg->msg_flags & MSG_MORE; - + goto maybe_error; + state = READ_ONCE(call->state); + ret = -ESHUTDOWN; + if (state >= RXRPC_CALL_COMPLETE) + goto maybe_error; + ret = -EPROTO; + if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && + state != RXRPC_CALL_SERVER_ACK_REQUEST && + state != RXRPC_CALL_SERVER_SEND_REPLY) + goto maybe_error; + + ret = -EMSGSIZE; if (call->tx_total_len != -1) { - if (len > call->tx_total_len) - return -EMSGSIZE; - if (!more && len != call->tx_total_len) - return -EMSGSIZE; + if (len - copied > call->tx_total_len) + goto maybe_error; + if (!more && len - copied != call->tx_total_len) + goto maybe_error; } skb = call->tx_pending; call->tx_pending = NULL; rxrpc_see_skb(skb, rxrpc_skb_seen); - copied = 0; do { /* Check to see if there's a ping ACK to reply to. */ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) @@ -331,16 +339,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (!rxrpc_check_tx_space(call, NULL)) { - ret = -EAGAIN; - if (msg->msg_flags & MSG_DONTWAIT) - goto maybe_error; - ret = rxrpc_wait_for_tx_window(rx, call, - &timeo, - msg->msg_flags & MSG_WAITALL); - if (ret < 0) - goto maybe_error; - } + if (!rxrpc_check_tx_space(call, NULL)) + goto wait_for_space; max = RXRPC_JUMBO_DATALEN; max -= call->conn->security_size; @@ -485,6 +485,27 @@ maybe_error: efault: ret = -EFAULT; goto out; + +wait_for_space: + ret = -EAGAIN; + if (msg->msg_flags & MSG_DONTWAIT) + goto maybe_error; + mutex_unlock(&call->user_mutex); + *_dropped_lock = true; + ret = rxrpc_wait_for_tx_window(rx, call, &timeo, + msg->msg_flags & MSG_WAITALL); + if (ret < 0) + goto maybe_error; + if (call->interruptibility == RXRPC_INTERRUPTIBLE) { + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + ret = sock_intr_errno(timeo); + goto maybe_error; + } + } else { + mutex_lock(&call->user_mutex); + } + *_dropped_lock = false; + goto reload; } /* @@ -646,6 +667,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) enum rxrpc_call_state state; struct rxrpc_call *call; unsigned long now, j; + bool dropped_lock = false; int ret; struct rxrpc_send_params p = { @@ -754,21 +776,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_abort_packet(call); } else if (p.command != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; - } else if (rxrpc_is_client_call(call) && - state != RXRPC_CALL_CLIENT_SEND_REQUEST) { - /* request phase complete for this client call */ - ret = -EPROTO; - } else if (rxrpc_is_service_call(call) && - state != RXRPC_CALL_SERVER_ACK_REQUEST && - state != RXRPC_CALL_SERVER_SEND_REPLY) { - /* Reply phase not begun or not complete for service call. */ - ret = -EPROTO; } else { - ret = rxrpc_send_data(rx, call, msg, len, NULL); + ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); } out_put_unlock: - mutex_unlock(&call->user_mutex); + if (!dropped_lock) + mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); @@ -796,6 +810,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx) { + bool dropped_lock = false; int ret; _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); @@ -813,7 +828,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, case RXRPC_CALL_SERVER_ACK_REQUEST: case RXRPC_CALL_SERVER_SEND_REPLY: ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, - notify_end_tx); + notify_end_tx, &dropped_lock); break; case RXRPC_CALL_COMPLETE: read_lock_bh(&call->state_lock); @@ -827,7 +842,8 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, break; } - mutex_unlock(&call->user_mutex); + if (!dropped_lock) + mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5d5391adb667c..68f1e89430b3b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -403,7 +403,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = dev_tx_weight; + int quota = READ_ONCE(dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { diff --git a/net/socket.c b/net/socket.c index d52c265ad449b..bcf68b150fe29 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1670,7 +1670,7 @@ int __sys_listen(int fd, int backlog) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { - somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; + somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index c5af31312e0cf..78c6648af7827 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1867,7 +1867,7 @@ call_encode(struct rpc_task *task) break; case -EKEYEXPIRED: if (!task->tk_cred_retry) { - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); } else { task->tk_action = call_refresh; task->tk_cred_retry--; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 38256aabf4f1d..8f3c9fbb99165 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -504,7 +504,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); sk->sk_shutdown = 0; sk->sk_backlog_rcv = tipc_sk_backlog_rcv; - sk->sk_rcvbuf = sysctl_tipc_rmem[1]; + sk->sk_rcvbuf = READ_ONCE(sysctl_tipc_rmem[1]); sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 1f08ebf7d80c5..24ca49ecebea3 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -170,7 +170,7 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) { struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (skb_queue_len(&ctx->out_queue) >= netdev_max_backlog) + if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog)) return -ENOBUFS; __skb_queue_tail(&ctx->out_queue, skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 61e6220ddd5ae..77e82033ad700 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -782,7 +782,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, trans = this_cpu_ptr(&xfrm_trans_tasklet); - if (skb_queue_len(&trans->queue) >= netdev_max_backlog) + if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 603b05ed7eb4c..0d12bdf59d4cc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3164,7 +3164,7 @@ ok: return dst; nopol: - if (!(dst_orig->dev->flags & IFF_LOOPBACK) && + if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) && net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { err = -EPERM; goto error; @@ -3641,6 +3641,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); + xfrm_pol_put(pols[0]); return 0; } pols[1]->curlft.use_time = ktime_get_real_seconds(); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bc0bbb1571cef..fdbd56ed4bd52 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1557,6 +1557,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->replay = orig->replay; x->preplay = orig->preplay; x->mapping_maxage = orig->mapping_maxage; + x->lastused = orig->lastused; x->new_mapping = 0; x->new_mapping_sport = 0; diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 5ee3c4d1fbb2b..3e7706c251e9e 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -248,7 +248,7 @@ endif # defined. get-executable-or-default fails with an error if the first argument is supplied but # doesn't exist. override PYTHON_CONFIG := $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON_AUTO)) -override PYTHON := $(call get-executable-or-default,PYTHON,$(subst -config,,$(PYTHON_AUTO))) +override PYTHON := $(call get-executable-or-default,PYTHON,$(subst -config,,$(PYTHON_CONFIG))) grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1))