From f79a0c28d01f3cfa305fb17b094fe4252c69ba6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 30 Nov 2018 07:47:05 +0100 Subject: [PATCH 01/54] spi: imx: add a device specific prepare_message callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e697271c4e2987b333148e16a2eb8b5b924fd40a ] This is just preparatory work which allows to move some initialisation that currently is done in the per transfer hook .config to an earlier point in time in the next few patches. There is no change in behaviour introduced by this patch. Signed-off-by: Uwe Kleine-König Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-imx.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 1ad4b69292ad..eb27f47578eb 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -59,6 +59,7 @@ struct spi_imx_data; struct spi_imx_devtype_data { void (*intctrl)(struct spi_imx_data *, int); + int (*prepare_message)(struct spi_imx_data *, struct spi_message *); int (*config)(struct spi_device *); void (*trigger)(struct spi_imx_data *); int (*rx_available)(struct spi_imx_data *); @@ -502,6 +503,12 @@ static void mx51_ecspi_disable(struct spi_imx_data *spi_imx) writel(ctrl, spi_imx->base + MX51_ECSPI_CTRL); } +static int mx51_ecspi_prepare_message(struct spi_imx_data *spi_imx, + struct spi_message *msg) +{ + return 0; +} + static int mx51_ecspi_config(struct spi_device *spi) { struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); @@ -672,6 +679,12 @@ static void mx31_trigger(struct spi_imx_data *spi_imx) writel(reg, spi_imx->base + MXC_CSPICTRL); } +static int mx31_prepare_message(struct spi_imx_data *spi_imx, + struct spi_message *msg) +{ + return 0; +} + static int mx31_config(struct spi_device *spi) { struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); @@ -768,6 +781,12 @@ static void mx21_trigger(struct spi_imx_data *spi_imx) writel(reg, spi_imx->base + MXC_CSPICTRL); } +static int mx21_prepare_message(struct spi_imx_data *spi_imx, + struct spi_message *msg) +{ + return 0; +} + static int mx21_config(struct spi_device *spi) { struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); @@ -837,6 +856,12 @@ static void mx1_trigger(struct spi_imx_data *spi_imx) writel(reg, spi_imx->base + MXC_CSPICTRL); } +static int mx1_prepare_message(struct spi_imx_data *spi_imx, + struct spi_message *msg) +{ + return 0; +} + static int mx1_config(struct spi_device *spi) { struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); @@ -871,6 +896,7 @@ static void mx1_reset(struct spi_imx_data *spi_imx) static struct spi_imx_devtype_data imx1_cspi_devtype_data = { .intctrl = mx1_intctrl, + .prepare_message = mx1_prepare_message, .config = mx1_config, .trigger = mx1_trigger, .rx_available = mx1_rx_available, @@ -884,6 +910,7 @@ static struct spi_imx_devtype_data imx1_cspi_devtype_data = { static struct spi_imx_devtype_data imx21_cspi_devtype_data = { .intctrl = mx21_intctrl, + .prepare_message = mx21_prepare_message, .config = mx21_config, .trigger = mx21_trigger, .rx_available = mx21_rx_available, @@ -898,6 +925,7 @@ static struct spi_imx_devtype_data imx21_cspi_devtype_data = { static struct spi_imx_devtype_data imx27_cspi_devtype_data = { /* i.mx27 cspi shares the functions with i.mx21 one */ .intctrl = mx21_intctrl, + .prepare_message = mx21_prepare_message, .config = 
mx21_config, .trigger = mx21_trigger, .rx_available = mx21_rx_available, @@ -911,6 +939,7 @@ static struct spi_imx_devtype_data imx27_cspi_devtype_data = { static struct spi_imx_devtype_data imx31_cspi_devtype_data = { .intctrl = mx31_intctrl, + .prepare_message = mx31_prepare_message, .config = mx31_config, .trigger = mx31_trigger, .rx_available = mx31_rx_available, @@ -925,6 +954,7 @@ static struct spi_imx_devtype_data imx31_cspi_devtype_data = { static struct spi_imx_devtype_data imx35_cspi_devtype_data = { /* i.mx35 and later cspi shares the functions with i.mx31 one */ .intctrl = mx31_intctrl, + .prepare_message = mx31_prepare_message, .config = mx31_config, .trigger = mx31_trigger, .rx_available = mx31_rx_available, @@ -938,6 +968,7 @@ static struct spi_imx_devtype_data imx35_cspi_devtype_data = { static struct spi_imx_devtype_data imx51_ecspi_devtype_data = { .intctrl = mx51_ecspi_intctrl, + .prepare_message = mx51_ecspi_prepare_message, .config = mx51_ecspi_config, .trigger = mx51_ecspi_trigger, .rx_available = mx51_ecspi_rx_available, @@ -952,6 +983,7 @@ static struct spi_imx_devtype_data imx51_ecspi_devtype_data = { static struct spi_imx_devtype_data imx53_ecspi_devtype_data = { .intctrl = mx51_ecspi_intctrl, + .prepare_message = mx51_ecspi_prepare_message, .config = mx51_ecspi_config, .trigger = mx51_ecspi_trigger, .rx_available = mx51_ecspi_rx_available, @@ -1486,7 +1518,13 @@ spi_imx_prepare_message(struct spi_master *master, struct spi_message *msg) return ret; } - return 0; + ret = spi_imx->devtype_data->prepare_message(spi_imx, msg); + if (ret) { + clk_disable(spi_imx->clk_ipg); + clk_disable(spi_imx->clk_per); + } + + return ret; } static int From ab6d0ef9d7e91f550aa112595a3eb0194a24dd4c Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Wed, 10 Oct 2018 10:32:42 +0000 Subject: [PATCH 02/54] spi: imx: move wml setting to later than setup_transfer [ Upstream commit 987a2dfe3f0485a82d87106e7e1c43f35c1d3b09 ] Current dynamic burst length is based on the whole transfer length, that's ok if there is only one sg, but is not right in case multi sgs in one transfer,because the tail data should be based on the last sg length instead of the whole transfer length. Move wml setting for DMA to the later place, thus, the next patch could get the right last sg length for wml setting. This patch is a preparation one, no any function change involved. Signed-off-by: Robin Gong Signed-off-by: Mark Brown Stable-dep-of: 00b80ac93553 ("spi: imx: mx51-ecspi: Move some initialisation to prepare_message hook.") Signed-off-by: Sasha Levin --- drivers/spi/spi-imx.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index eb27f47578eb..686251e05edf 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -64,6 +64,7 @@ struct spi_imx_devtype_data { void (*trigger)(struct spi_imx_data *); int (*rx_available)(struct spi_imx_data *); void (*reset)(struct spi_imx_data *); + void (*setup_wml)(struct spi_imx_data *); void (*disable)(struct spi_imx_data *); bool has_dmamode; bool has_slavemode; @@ -601,6 +602,11 @@ static int mx51_ecspi_config(struct spi_device *spi) else /* SCLK is _very_ slow */ usleep_range(delay, delay + 10); + return 0; +} + +static void mx51_setup_wml(struct spi_imx_data *spi_imx) +{ /* * Configure the DMA register: setup the watermark * and enable DMA request. 
@@ -611,8 +617,6 @@ static int mx51_ecspi_config(struct spi_device *spi) MX51_ECSPI_DMA_RXT_WML(spi_imx->wml) | MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN | MX51_ECSPI_DMA_RXTDEN, spi_imx->base + MX51_ECSPI_DMA); - - return 0; } static int mx51_ecspi_rx_available(struct spi_imx_data *spi_imx) @@ -973,6 +977,7 @@ static struct spi_imx_devtype_data imx51_ecspi_devtype_data = { .trigger = mx51_ecspi_trigger, .rx_available = mx51_ecspi_rx_available, .reset = mx51_ecspi_reset, + .setup_wml = mx51_setup_wml, .fifo_size = 64, .has_dmamode = true, .dynamic_burst = true, @@ -1181,7 +1186,6 @@ static int spi_imx_setupxfer(struct spi_device *spi, struct spi_transfer *t) { struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); - int ret; if (!t) return 0; @@ -1222,12 +1226,6 @@ static int spi_imx_setupxfer(struct spi_device *spi, else spi_imx->usedma = 0; - if (spi_imx->usedma) { - ret = spi_imx_dma_configure(spi->master); - if (ret) - return ret; - } - if (is_imx53_ecspi(spi_imx) && spi_imx->slave_mode) { spi_imx->rx = mx53_ecspi_rx_slave; spi_imx->tx = mx53_ecspi_tx_slave; @@ -1332,6 +1330,13 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, unsigned long timeout; struct spi_master *master = spi_imx->bitbang.master; struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg; + int ret; + + ret = spi_imx_dma_configure(master); + if (ret) + return ret; + + spi_imx->devtype_data->setup_wml(spi_imx); /* * The TX DMA setup starts the transfer, so make sure RX is configured From 22dae4676366d6bce7ac87652b60f55de2777ec4 Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Wed, 10 Oct 2018 10:32:45 +0000 Subject: [PATCH 03/54] spi: imx: correct wml as the last sg length [ Upstream commit 5ba5a3730639caddf42af11c60f3f3d99d9a5f00 ] Correct wml as the last rx sg length instead of the whole transfer length. Otherwise, mtd_stresstest will be failed as below: insmod mtd_stresstest.ko dev=0 ================================================= mtd_stresstest: MTD device: 0 mtd_stresstest: not NAND flash, assume page size is 512 bytes. 
mtd_stresstest: MTD device size 4194304, eraseblock size 65536, page size 512, count of eraseblocks 64, pa0 mtd_stresstest: doing operations mtd_stresstest: 0 operations done mtd_test: mtd_read from 1ff532, size 880 mtd_test: mtd_read from 20c267, size 64998 spi_master spi0: I/O Error in DMA RX m25p80 spi0.0: SPI transfer failed: -110 spi_master spi0: failed to transfer one message from queue mtd_test: error: read failed at 0x20c267 mtd_stresstest: error -110 occurred ================================================= insmod: ERROR: could not insert module mtd_stresstest.ko: Connection timed out Signed-off-by: Robin Gong Signed-off-by: Mark Brown Stable-dep-of: 00b80ac93553 ("spi: imx: mx51-ecspi: Move some initialisation to prepare_message hook.") Signed-off-by: Sasha Levin --- drivers/spi/spi-imx.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 686251e05edf..139127e27a14 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -218,7 +218,6 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi, struct spi_transfer *transfer) { struct spi_imx_data *spi_imx = spi_master_get_devdata(master); - unsigned int bytes_per_word, i; if (!master->dma_rx) return false; @@ -226,14 +225,6 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi, if (spi_imx->slave_mode) return false; - bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word); - - for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) { - if (!(transfer->len % (i * bytes_per_word))) - break; - } - - spi_imx->wml = i; spi_imx->dynamic_burst = 0; return true; @@ -612,7 +603,7 @@ static void mx51_setup_wml(struct spi_imx_data *spi_imx) * and enable DMA request. */ - writel(MX51_ECSPI_DMA_RX_WML(spi_imx->wml) | + writel(MX51_ECSPI_DMA_RX_WML(spi_imx->wml - 1) | MX51_ECSPI_DMA_TX_WML(spi_imx->wml) | MX51_ECSPI_DMA_RXT_WML(spi_imx->wml) | MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN | @@ -1330,12 +1321,30 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, unsigned long timeout; struct spi_master *master = spi_imx->bitbang.master; struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg; + struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents); + unsigned int bytes_per_word, i; int ret; + /* Get the right burst length from the last sg to ensure no tail data */ + bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word); + for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) { + if (!(sg_dma_len(last_sg) % (i * bytes_per_word))) + break; + } + /* Use 1 as wml in case no available burst length got */ + if (i == 0) + i = 1; + + spi_imx->wml = i; + ret = spi_imx_dma_configure(master); if (ret) return ret; + if (!spi_imx->devtype_data->setup_wml) { + dev_err(spi_imx->dev, "No setup_wml()?\n"); + return -EINVAL; + } spi_imx->devtype_data->setup_wml(spi_imx); /* From aca301ece521d5804a442248cc7da8754d537b18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 30 Nov 2018 07:47:06 +0100 Subject: [PATCH 04/54] spi: imx: mx51-ecspi: Move some initialisation to prepare_message hook. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 00b80ac9355397455adec24c9ee76f1b0225cd27 ] The relevant difference between prepare_message and config is that the former is run before the CS signal is asserted. 
So the polarity of the CLK line must be configured in prepare_message as an edge generated by config might already result in a latch of the MOSI line. Signed-off-by: Uwe Kleine-König Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-imx.c | 67 ++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 139127e27a14..0078cb365d8c 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -498,14 +498,9 @@ static void mx51_ecspi_disable(struct spi_imx_data *spi_imx) static int mx51_ecspi_prepare_message(struct spi_imx_data *spi_imx, struct spi_message *msg) { - return 0; -} - -static int mx51_ecspi_config(struct spi_device *spi) -{ - struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); + struct spi_device *spi = msg->spi; u32 ctrl = MX51_ECSPI_CTRL_ENABLE; - u32 clk = spi_imx->speed_hz, delay, reg; + u32 testreg; u32 cfg = readl(spi_imx->base + MX51_ECSPI_CONFIG); /* set Master or Slave mode */ @@ -520,19 +515,21 @@ static int mx51_ecspi_config(struct spi_device *spi) if (spi->mode & SPI_READY) ctrl |= MX51_ECSPI_CTRL_DRCTL(spi_imx->spi_drctl); - /* set clock speed */ - ctrl |= mx51_ecspi_clkdiv(spi_imx, spi_imx->speed_hz, &clk); - spi_imx->spi_bus_clk = clk; - /* set chip select to use */ ctrl |= MX51_ECSPI_CTRL_CS(spi->chip_select); - if (spi_imx->slave_mode && is_imx53_ecspi(spi_imx)) - ctrl |= (spi_imx->slave_burst * 8 - 1) - << MX51_ECSPI_CTRL_BL_OFFSET; + /* + * The ctrl register must be written first, with the EN bit set other + * registers must not be written to. + */ + writel(ctrl, spi_imx->base + MX51_ECSPI_CTRL); + + testreg = readl(spi_imx->base + MX51_ECSPI_TESTREG); + if (spi->mode & SPI_LOOP) + testreg |= MX51_ECSPI_TESTREG_LBC; else - ctrl |= (spi_imx->bits_per_word - 1) - << MX51_ECSPI_CTRL_BL_OFFSET; + testreg &= ~MX51_ECSPI_TESTREG_LBC; + writel(testreg, spi_imx->base + MX51_ECSPI_TESTREG); /* * eCSPI burst completion by Chip Select signal in Slave mode @@ -556,26 +553,43 @@ static int mx51_ecspi_config(struct spi_device *spi) cfg &= ~MX51_ECSPI_CONFIG_SCLKPOL(spi->chip_select); cfg &= ~MX51_ECSPI_CONFIG_SCLKCTL(spi->chip_select); } + if (spi->mode & SPI_CS_HIGH) cfg |= MX51_ECSPI_CONFIG_SSBPOL(spi->chip_select); else cfg &= ~MX51_ECSPI_CONFIG_SSBPOL(spi->chip_select); + writel(cfg, spi_imx->base + MX51_ECSPI_CONFIG); + + return 0; +} + +static int mx51_ecspi_config(struct spi_device *spi) +{ + struct spi_imx_data *spi_imx = spi_master_get_devdata(spi->master); + u32 ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL); + u32 clk = spi_imx->speed_hz, delay; + + /* Clear BL field and set the right value */ + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK; + if (spi_imx->slave_mode && is_imx53_ecspi(spi_imx)) + ctrl |= (spi_imx->slave_burst * 8 - 1) + << MX51_ECSPI_CTRL_BL_OFFSET; + else + ctrl |= (spi_imx->bits_per_word - 1) + << MX51_ECSPI_CTRL_BL_OFFSET; + + /* set clock speed */ + ctrl &= ~(0xf << MX51_ECSPI_CTRL_POSTDIV_OFFSET | + 0xf << MX51_ECSPI_CTRL_PREDIV_OFFSET); + ctrl |= mx51_ecspi_clkdiv(spi_imx, spi_imx->speed_hz, &clk); + spi_imx->spi_bus_clk = clk; + if (spi_imx->usedma) ctrl |= MX51_ECSPI_CTRL_SMC; - /* CTRL register always go first to bring out controller from reset */ writel(ctrl, spi_imx->base + MX51_ECSPI_CTRL); - reg = readl(spi_imx->base + MX51_ECSPI_TESTREG); - if (spi->mode & SPI_LOOP) - reg |= MX51_ECSPI_TESTREG_LBC; - else - reg &= ~MX51_ECSPI_TESTREG_LBC; - writel(reg, spi_imx->base + MX51_ECSPI_TESTREG); - - writel(cfg, 
spi_imx->base + MX51_ECSPI_CONFIG); - /* * Wait until the changes in the configuration register CONFIGREG * propagate into the hardware. It takes exactly one tick of the @@ -602,7 +616,6 @@ static void mx51_setup_wml(struct spi_imx_data *spi_imx) * Configure the DMA register: setup the watermark * and enable DMA request. */ - writel(MX51_ECSPI_DMA_RX_WML(spi_imx->wml - 1) | MX51_ECSPI_DMA_TX_WML(spi_imx->wml) | MX51_ECSPI_DMA_RXT_WML(spi_imx->wml) | From 9d72eb94c71db49bf1e82fc0fc24d1d888b8e3e2 Mon Sep 17 00:00:00 2001 From: Evgeny Novikov Date: Thu, 23 Jul 2020 19:04:53 +0200 Subject: [PATCH 05/54] media: davinci: vpif_capture: fix potential double free [ Upstream commit 602649eadaa0c977e362e641f51ec306bc1d365d ] In case of errors vpif_probe_complete() releases memory for vpif_obj.sd and unregisters the V4L2 device. But then this is done again by vpif_probe() itself. The patch removes the cleaning from vpif_probe_complete(). Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Evgeny Novikov Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/platform/davinci/vpif_capture.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/media/platform/davinci/vpif_capture.c b/drivers/media/platform/davinci/vpif_capture.c index a96f53ce8088..cf1d11e6dd8c 100644 --- a/drivers/media/platform/davinci/vpif_capture.c +++ b/drivers/media/platform/davinci/vpif_capture.c @@ -1489,8 +1489,6 @@ probe_out: /* Unregister video device */ video_unregister_device(&ch->video_dev); } - kfree(vpif_obj.sd); - v4l2_device_unregister(&vpif_obj.v4l2_dev); return err; } From 9a2fc41acb69dd4e2a58d0c04346c3333c2341fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Nov 2023 15:57:13 +0100 Subject: [PATCH 06/54] hrtimers: Push pending hrtimers away from outgoing CPU earlier [ Upstream commit 5c0930ccaad5a74d74e8b18b648c5eb21ed2fe94 ] 2b8272ff4a70 ("cpu/hotplug: Prevent self deadlock on CPU hot-unplug") solved the straight forward CPU hotplug deadlock vs. the scheduler bandwidth timer. Yu discovered a more involved variant where a task which has a bandwidth timer started on the outgoing CPU holds a lock and then gets throttled. If the lock required by one of the CPU hotplug callbacks the hotplug operation deadlocks because the unthrottling timer event is not handled on the dying CPU and can only be recovered once the control CPU reaches the hotplug state which pulls the pending hrtimers from the dead CPU. Solve this by pushing the hrtimers away from the dying CPU in the dying callbacks. Nothing can queue a hrtimer on the dying CPU at that point because all other CPUs spin in stop_machine() with interrupts disabled and once the operation is finished the CPU is marked offline. 
Reported-by: Yu Liao Signed-off-by: Thomas Gleixner Tested-by: Liu Tie Link: https://lore.kernel.org/r/87a5rphara.ffs@tglx Signed-off-by: Sasha Levin --- include/linux/cpuhotplug.h | 1 + include/linux/hrtimer.h | 4 ++-- kernel/cpu.c | 8 +++++++- kernel/time/hrtimer.c | 33 ++++++++++++--------------------- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 71a0a5ffdbb1..dd9f035be63f 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -139,6 +139,7 @@ enum cpuhp_state { CPUHP_AP_ARM_CORESIGHT_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, + CPUHP_AP_HRTIMERS_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 542b4fa2cda9..3bdaa92a2cab 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -508,9 +508,9 @@ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU -int hrtimers_dead_cpu(unsigned int cpu); +int hrtimers_cpu_dying(unsigned int cpu); #else -#define hrtimers_dead_cpu NULL +#define hrtimers_cpu_dying NULL #endif #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index c9ca190ec034..34c09c3d37bc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1418,7 +1418,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, - .teardown.single = hrtimers_dead_cpu, + .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", @@ -1485,6 +1485,12 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, + [CPUHP_AP_HRTIMERS_DYING] = { + .name = "hrtimers:dying", + .startup.single = NULL, + .teardown.single = hrtimers_cpu_dying, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8512f06f0ebe..bf74f43e42af 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1922,29 +1922,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -int hrtimers_dead_cpu(unsigned int scpu) +int hrtimers_cpu_dying(unsigned int dying_cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, ncpu = cpumask_first(cpu_active_mask); - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); + tick_cancel_sched_timer(dying_cpu); + + old_base = this_cpu_ptr(&hrtimer_bases); + new_base = &per_cpu(hrtimer_bases, ncpu); - /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&old_base->lock); + raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], @@ -1955,15 +1948,13 @@ int hrtimers_dead_cpu(unsigned int scpu) * The migration might have changed the first expiring softirq * timer on this CPU. Update it. 
*/ - hrtimer_update_softirq_timer(new_base, false); + __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); + /* Tell the other CPU to retrigger the next event */ + smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); - raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); - local_bh_enable(); return 0; } From a12606e5ad0cee8f4ba3ec68561c4d6275d2df57 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 13 Nov 2023 21:13:23 +0100 Subject: [PATCH 07/54] netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test [ Upstream commit 28628fa952fefc7f2072ce6e8016968cc452b1ba ] Linkui Xiao reported that there's a race condition when ipset swap and destroy is called, which can lead to crash in add/del/test element operations. Swap then destroy are usual operations to replace a set with another one in a production system. The issue can in some cases be reproduced with the script: ipset create hash_ip1 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip1 172.20.0.0/16 ipset add hash_ip1 192.168.0.0/16 iptables -A INPUT -m set --match-set hash_ip1 src -j ACCEPT while [ 1 ] do # ... Ongoing traffic... ipset create hash_ip2 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip2 172.20.0.0/16 ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 sleep 0.05 done In the race case the possible order of the operations are CPU0 CPU1 ip_set_test ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 hash_net_kadt Swap replaces hash_ip1 with hash_ip2 and then destroy removes hash_ip2 which is the original hash_ip1. ip_set_test was called on hash_ip1 and because destroy removed it, hash_net_kadt crashes. The fix is to force ip_set_swap() to wait for all readers to finish accessing the old set pointers by calling synchronize_rcu(). The first version of the patch was written by Linkui Xiao . v2: synchronize_rcu() is moved into ip_set_swap() in order not to burden ip_set_destroy() unnecessarily when all sets are destroyed. v3: Florian Westphal pointed out that all netfilter hooks run with rcu_read_lock() held and em_ipset.c wraps the entire ip_set_test() in rcu read lock/unlock pair. So there's no need to extend the rcu read locked area in ipset itself. Closes: https://lore.kernel.org/all/69e7963b-e7f8-3ad0-210-7b86eebf7f78@netfilter.org/ Reported by: Linkui Xiao Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/ipset/ip_set_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 31756d1bf83e..031bb83aed70 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -64,6 +64,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); ip_set_dereference((inst)->ip_set_list)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. 
Adding/deleting types is @@ -552,15 +554,10 @@ __ip_set_put_netlink(struct ip_set *set) static inline struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } int @@ -1227,6 +1224,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); + /* Make sure all readers of the old set pointers are completed. */ + synchronize_rcu(); + return 0; } From dbd59898515f4438a17b5d7c4753ca3d43d98d92 Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:49 -0800 Subject: [PATCH 08/54] tg3: Move the [rt]x_dropped counters to tg3_napi [ Upstream commit 907d1bdb8b2cc0357d03a1c34d2a08d9943760b1 ] This change moves [rt]x_dropped counters to tg3_napi so that they can be updated by a single writer, race-free. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Michael Chan Link: https://lore.kernel.org/r/20231113182350.37472-1-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/tg3.c | 38 +++++++++++++++++++++++++---- drivers/net/ethernet/broadcom/tg3.h | 4 +-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index f0b5c8a4d29f..84a5bddf614c 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -6859,7 +6859,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) desc_idx, *post_ptr); drop_it_no_recycle: /* Other statistics kept track of by card. */ - tp->rx_dropped++; + tnapi->rx_dropped++; goto next_pkt; } @@ -8163,7 +8163,7 @@ dma_error: drop: dev_kfree_skb_any(skb); drop_nofree: - tp->tx_dropped++; + tnapi->tx_dropped++; return NETDEV_TX_OK; } @@ -9342,7 +9342,7 @@ static void __tg3_set_rx_mode(struct net_device *); /* tp->lock is held. */ static int tg3_halt(struct tg3 *tp, int kind, bool silent) { - int err; + int err, i; tg3_stop_fw(tp); @@ -9363,6 +9363,13 @@ static int tg3_halt(struct tg3 *tp, int kind, bool silent) /* And make sure the next sample is new data */ memset(tp->hw_stats, 0, sizeof(struct tg3_hw_stats)); + + for (i = 0; i < TG3_IRQ_MAX_VECS; ++i) { + struct tg3_napi *tnapi = &tp->napi[i]; + + tnapi->rx_dropped = 0; + tnapi->tx_dropped = 0; + } } return err; @@ -11919,6 +11926,9 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) { struct rtnl_link_stats64 *old_stats = &tp->net_stats_prev; struct tg3_hw_stats *hw_stats = tp->hw_stats; + unsigned long rx_dropped; + unsigned long tx_dropped; + int i; stats->rx_packets = old_stats->rx_packets + get_stat64(&hw_stats->rx_ucast_packets) + @@ -11965,8 +11975,26 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) stats->rx_missed_errors = old_stats->rx_missed_errors + get_stat64(&hw_stats->rx_discards); - stats->rx_dropped = tp->rx_dropped; - stats->tx_dropped = tp->tx_dropped; + /* Aggregate per-queue counters. The per-queue counters are updated + * by a single writer, race-free. 
The result computed by this loop + * might not be 100% accurate (counters can be updated in the middle of + * the loop) but the next tg3_get_nstats() will recompute the current + * value so it is acceptable. + * + * Note that these counters wrap around at 4G on 32bit machines. + */ + rx_dropped = (unsigned long)(old_stats->rx_dropped); + tx_dropped = (unsigned long)(old_stats->tx_dropped); + + for (i = 0; i < tp->irq_cnt; i++) { + struct tg3_napi *tnapi = &tp->napi[i]; + + rx_dropped += tnapi->rx_dropped; + tx_dropped += tnapi->tx_dropped; + } + + stats->rx_dropped = rx_dropped; + stats->tx_dropped = tx_dropped; } static int tg3_get_regs_len(struct net_device *dev) diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index a772a33b685c..b8cc8ff4e878 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -3018,6 +3018,7 @@ struct tg3_napi { u16 *rx_rcb_prod_idx; struct tg3_rx_prodring_set prodring; struct tg3_rx_buffer_desc *rx_rcb; + unsigned long rx_dropped; u32 tx_prod ____cacheline_aligned; u32 tx_cons; @@ -3026,6 +3027,7 @@ struct tg3_napi { u32 prodmbox; struct tg3_tx_buffer_desc *tx_ring; struct tg3_tx_ring_info *tx_buffers; + unsigned long tx_dropped; dma_addr_t status_mapping; dma_addr_t rx_rcb_mapping; @@ -3219,8 +3221,6 @@ struct tg3 { /* begin "everything else" cacheline(s) section */ - unsigned long rx_dropped; - unsigned long tx_dropped; struct rtnl_link_stats64 net_stats_prev; struct tg3_ethtool_stats estats_prev; From 7412dfbac8bead14fd249e307b5a0a4f10be2d9d Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:50 -0800 Subject: [PATCH 09/54] tg3: Increment tx_dropped in tg3_tso_bug() [ Upstream commit 17dd5efe5f36a96bd78012594fabe21efb01186b ] tg3_tso_bug() drops a packet if it cannot be segmented for any reason. The number of discarded frames should be incremented accordingly. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20231113182350.37472-2-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/tg3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 84a5bddf614c..68bb4a2ff7ce 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7889,8 +7889,10 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi, segs = skb_gso_segment(skb, tp->dev->features & ~(NETIF_F_TSO | NETIF_F_TSO6)); - if (IS_ERR(segs) || !segs) + if (IS_ERR(segs) || !segs) { + tnapi->tx_dropped++; goto tg3_tso_bug_end; + } do { nskb = segs; From 19f67233d16f7f4be5cab30d6cd1d578867fa0da Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 15 Nov 2023 13:16:53 +0900 Subject: [PATCH 10/54] kconfig: fix memory leak from range properties [ Upstream commit ae1eff0349f2e908fc083630e8441ea6dc434dc0 ] Currently, sym_validate_range() duplicates the range string using xstrdup(), which is overwritten by a subsequent sym_calc_value() call. It results in a memory leak. Instead, only the pointer should be copied. Below is a test case, with a summary from Valgrind. 
[Test Kconfig]

  config FOO
          int "foo"
          range 10 20

[Test .config]

  CONFIG_FOO=0

[Before]

  LEAK SUMMARY:
     definitely lost: 3 bytes in 1 blocks
     indirectly lost: 0 bytes in 0 blocks
       possibly lost: 0 bytes in 0 blocks
     still reachable: 17,465 bytes in 21 blocks
          suppressed: 0 bytes in 0 blocks

[After]

  LEAK SUMMARY:
     definitely lost: 0 bytes in 0 blocks
     indirectly lost: 0 bytes in 0 blocks
       possibly lost: 0 bytes in 0 blocks
     still reachable: 17,462 bytes in 20 blocks
          suppressed: 0 bytes in 0 blocks

Signed-off-by: Masahiro Yamada
Signed-off-by: Sasha Levin
---
 scripts/kconfig/symbol.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c
index 703b9b899ee9..5adb60b7e12f 100644
--- a/scripts/kconfig/symbol.c
+++ b/scripts/kconfig/symbol.c
@@ -119,9 +119,9 @@ static long long sym_get_range_val(struct symbol *sym, int base)
 static void sym_validate_range(struct symbol *sym)
 {
 	struct property *prop;
+	struct symbol *range_sym;
 	int base;
 	long long val, val2;
-	char str[64];
 
 	switch (sym->type) {
 	case S_INT:
@@ -137,17 +137,15 @@ static void sym_validate_range(struct symbol *sym)
 	if (!prop)
 		return;
 	val = strtoll(sym->curr.val, NULL, base);
-	val2 = sym_get_range_val(prop->expr->left.sym, base);
+	range_sym = prop->expr->left.sym;
+	val2 = sym_get_range_val(range_sym, base);
 	if (val >= val2) {
-		val2 = sym_get_range_val(prop->expr->right.sym, base);
+		range_sym = prop->expr->right.sym;
+		val2 = sym_get_range_val(range_sym, base);
 		if (val <= val2)
 			return;
 	}
-	if (sym->type == S_INT)
-		sprintf(str, "%lld", val2);
-	else
-		sprintf(str, "0x%llx", val2);
-	sym->curr.val = xstrdup(str);
+	sym->curr.val = range_sym->curr.val;
 }
 
 static void sym_set_changed(struct symbol *sym)

From c28ff7e0adb3af7d217feb8f3eee3af37016ecbc Mon Sep 17 00:00:00 2001
From: YuanShang
Date: Tue, 31 Oct 2023 10:32:37 +0800
Subject: [PATCH 11/54] drm/amdgpu: correct chunk_ptr to a pointer to chunk.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 50d51374b498457c4dea26779d32ccfed12ddaff ]

The variable "chunk_ptr" should be a pointer pointing
to a struct drm_amdgpu_cs_chunk instead of to a pointer
of that.

Signed-off-by: YuanShang
Reviewed-by: Christian König
Signed-off-by: Alex Deucher
Signed-off-by: Sasha Levin
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 70e446c2acf8..94b06c918e80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -147,7 +147,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 	}
 
 	for (i = 0; i < p->nchunks; i++) {
-		struct drm_amdgpu_cs_chunk __user **chunk_ptr = NULL;
+		struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL;
 		struct drm_amdgpu_cs_chunk user_chunk;
 		uint32_t __user *cdata;
 

From 22a9d504748433f079aa1633697c05de68fb352d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 29 Nov 2023 16:06:30 +0000
Subject: [PATCH 12/54] ipv6: fix potential NULL deref in fib6_add()

[ Upstream commit 75475bb51e78a3f54ad2f69380f2a1c985e85f2d ]

If fib6_find_prefix() returns NULL, we should silently fallback using
fib6_null_entry regardless of RT6_DEBUG value.

syzbot reported:

WARNING: CPU: 0 PID: 5477 at net/ipv6/ip6_fib.c:1516 fib6_add+0x310d/0x3fa0 net/ipv6/ip6_fib.c:1516
Modules linked in:
CPU: 0 PID: 5477 Comm: syz-executor.0 Not tainted 6.7.0-rc2-syzkaller-00029-g9b6de136b5f0 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023
RIP: 0010:fib6_add+0x310d/0x3fa0 net/ipv6/ip6_fib.c:1516
Code: 00 48 8b 54 24 68 e8 42 22 00 00 48 85 c0 74 14 49 89 c6 e8 d5 d3 c2 f7 eb 5d e8 ce d3 c2 f7 e9 ca 00 00 00 e8 c4 d3 c2 f7 90 <0f> 0b 90 48 b8 00 00 00 00 00 fc ff df 48 8b 4c 24 38 80 3c 01 00
RSP: 0018:ffffc90005067740 EFLAGS: 00010293
RAX: ffffffff89cba5bc RBX: ffffc90005067ab0 RCX: ffff88801a2e9dc0
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
RBP: ffffc90005067980 R08: ffffffff89cbca85 R09: 1ffff110040d4b85
R10: dffffc0000000000 R11: ffffed10040d4b86 R12: 00000000ffffffff
R13: 1ffff110051c3904 R14: ffff8880206a5c00 R15: ffff888028e1c820
FS: 00007f763783c6c0(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f763783bff8 CR3: 000000007f74d000 CR4: 00000000003506f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 __ip6_ins_rt net/ipv6/route.c:1303 [inline]
 ip6_route_add+0x88/0x120 net/ipv6/route.c:3847
 ipv6_route_ioctl+0x525/0x7b0 net/ipv6/route.c:4467
 inet6_ioctl+0x21a/0x270 net/ipv6/af_inet6.c:575
 sock_do_ioctl+0x152/0x460 net/socket.c:1220
 sock_ioctl+0x615/0x8c0 net/socket.c:1339
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:871 [inline]
 __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:857
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x45/0x110 arch/x86/entry/common.c:82

Fixes: 7bbfe00e0252 ("ipv6: fix general protection fault in fib6_add()")
Reported-by: syzbot
Signed-off-by: Eric Dumazet
Cc: Wei Wang
Reviewed-by: David Ahern
Link: https://lore.kernel.org/r/20231129160630.3509216-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
Signed-off-by: Sasha Levin
---
 net/ipv6/ip6_fib.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5ff67cb8b6ac..92bc56028b8b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1351,13 +1351,9 @@ out:
 			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
 				pn_leaf = fib6_find_prefix(info->nl_net, table,
 							   pn);
-#if RT6_DEBUG >= 2
-				if (!pn_leaf) {
-					WARN_ON(!pn_leaf);
+				if (!pn_leaf)
 					pn_leaf =
 						info->nl_net->ipv6.fib6_null_entry;
-				}
-#endif
 				fib6_info_hold(pn_leaf);
 				rcu_assign_pointer(pn->leaf, pn_leaf);
 			}

From 36192fc0e6661b1b377eaafdd5756616d5fc9a48 Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Wed, 29 Nov 2023 21:58:53 -0800
Subject: [PATCH 13/54] hv_netvsc: rndis_filter needs to select NLS

[ Upstream commit 6c89f49964375c904cea33c0247467873f4daf2c ]

rndis_filter uses utf8s_to_utf16s() which is provided by setting NLS,
so select NLS to fix the build error:

ERROR: modpost: "utf8s_to_utf16s" [drivers/net/hyperv/hv_netvsc.ko] undefined!

Fixes: 1ce09e899d28 ("hyperv: Add support for setting MAC from within guests")
Signed-off-by: Randy Dunlap
Cc: Haiyang Zhang
Cc: K. Y. Srinivasan
Cc: Wei Liu
Cc: Dexuan Cui
Reviewed-by: Simon Horman
Tested-by: Simon Horman # build-tested
Reviewed-by: Michael Kelley
Link: https://lore.kernel.org/r/20231130055853.19069-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski
Signed-off-by: Sasha Levin
---
 drivers/net/hyperv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig
index 0765d5f61714..386658ce1297 100644
--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -2,5 +2,6 @@ config HYPERV_NET
 	tristate "Microsoft Hyper-V virtual network driver"
 	depends on HYPERV
 	select UCS2_STRING
+	select NLS
 	help
 	  Select this option to enable the Hyper-V virtual network driver.

From eea423aa8cfef1b1309eb26a9b092379f841b6ef Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish"
Date: Thu, 28 Jan 2021 20:48:02 +0100
Subject: [PATCH 14/54] net: arcnet: Fix RESET flag handling

[ Upstream commit 01365633bd1c836240f9bbf86bbeee749795480a ]

The main arcnet interrupt handler calls arcnet_close() then
arcnet_open(), if the RESET status flag is encountered.

This is invalid:

  1) In general, interrupt handlers should never call ->ndo_stop() and
     ->ndo_open() functions. They are usually full of blocking calls and
     other methods that are expected to be called only from drivers init
     and exit code paths.

  2) arcnet_close() contains a del_timer_sync(). If the irq handler
     interrupts the to-be-deleted timer, del_timer_sync() will just loop
     forever.

  3) arcnet_close() also calls tasklet_kill(), which has a warning if
     called from irq context.

  4) For device reset, the sequence "arcnet_close(); arcnet_open();" is
     not complete. Some children arcnet drivers have special init/exit
     code sequences, which then embed a call to arcnet_open() and
     arcnet_close() accordingly. Check drivers/net/arcnet/com20020.c.

Run the device RESET sequence from a scheduled workqueue instead.

Signed-off-by: Ahmed S.
Darwish Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20210128194802.727770-1-a.darwish@linutronix.de Signed-off-by: Jakub Kicinski Stable-dep-of: 6b17a597fc2f ("arcnet: restoring support for multiple Sohard Arcnet cards") Signed-off-by: Sasha Levin --- drivers/net/arcnet/arc-rimi.c | 4 +- drivers/net/arcnet/arcdevice.h | 6 +++ drivers/net/arcnet/arcnet.c | 66 +++++++++++++++++++++++++++++-- drivers/net/arcnet/com20020-isa.c | 4 +- drivers/net/arcnet/com20020-pci.c | 2 +- drivers/net/arcnet/com20020_cs.c | 2 +- drivers/net/arcnet/com90io.c | 4 +- drivers/net/arcnet/com90xx.c | 4 +- 8 files changed, 78 insertions(+), 14 deletions(-) diff --git a/drivers/net/arcnet/arc-rimi.c b/drivers/net/arcnet/arc-rimi.c index a07e24970be4..3d0ce6d46a24 100644 --- a/drivers/net/arcnet/arc-rimi.c +++ b/drivers/net/arcnet/arc-rimi.c @@ -332,7 +332,7 @@ static int __init arc_rimi_init(void) dev->irq = 9; if (arcrimi_probe(dev)) { - free_netdev(dev); + free_arcdev(dev); return -EIO; } @@ -349,7 +349,7 @@ static void __exit arc_rimi_exit(void) iounmap(lp->mem_start); release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); free_irq(dev->irq, dev); - free_netdev(dev); + free_arcdev(dev); } #ifndef MODULE diff --git a/drivers/net/arcnet/arcdevice.h b/drivers/net/arcnet/arcdevice.h index d09b2b46ab63..1ad5e4a03d23 100644 --- a/drivers/net/arcnet/arcdevice.h +++ b/drivers/net/arcnet/arcdevice.h @@ -303,6 +303,10 @@ struct arcnet_local { int excnak_pending; /* We just got an excesive nak interrupt */ + /* RESET flag handling */ + int reset_in_progress; + struct work_struct reset_work; + struct { uint16_t sequence; /* sequence number (incs with each packet) */ __be16 aborted_seq; @@ -355,7 +359,9 @@ void arcnet_dump_skb(struct net_device *dev, struct sk_buff *skb, char *desc) void arcnet_unregister_proto(struct ArcProto *proto); irqreturn_t arcnet_interrupt(int irq, void *dev_id); + struct net_device *alloc_arcdev(const char *name); +void free_arcdev(struct net_device *dev); int arcnet_open(struct net_device *dev); int arcnet_close(struct net_device *dev); diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 2b112d3d8540..c5ff13887a1d 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -387,10 +387,44 @@ static void arcnet_timer(struct timer_list *t) struct arcnet_local *lp = from_timer(lp, t, timer); struct net_device *dev = lp->dev; - if (!netif_carrier_ok(dev)) { + spin_lock_irq(&lp->lock); + + if (!lp->reset_in_progress && !netif_carrier_ok(dev)) { netif_carrier_on(dev); netdev_info(dev, "link up\n"); } + + spin_unlock_irq(&lp->lock); +} + +static void reset_device_work(struct work_struct *work) +{ + struct arcnet_local *lp; + struct net_device *dev; + + lp = container_of(work, struct arcnet_local, reset_work); + dev = lp->dev; + + /* Do not bring the network interface back up if an ifdown + * was already done. + */ + if (!netif_running(dev) || !lp->reset_in_progress) + return; + + rtnl_lock(); + + /* Do another check, in case of an ifdown that was triggered in + * the small race window between the exit condition above and + * acquiring RTNL. 
+ */ + if (!netif_running(dev) || !lp->reset_in_progress) + goto out; + + dev_close(dev); + dev_open(dev); + +out: + rtnl_unlock(); } static void arcnet_reply_tasklet(unsigned long data) @@ -452,12 +486,25 @@ struct net_device *alloc_arcdev(const char *name) lp->dev = dev; spin_lock_init(&lp->lock); timer_setup(&lp->timer, arcnet_timer, 0); + INIT_WORK(&lp->reset_work, reset_device_work); } return dev; } EXPORT_SYMBOL(alloc_arcdev); +void free_arcdev(struct net_device *dev) +{ + struct arcnet_local *lp = netdev_priv(dev); + + /* Do not cancel this at ->ndo_close(), as the workqueue itself + * indirectly calls the ifdown path through dev_close(). + */ + cancel_work_sync(&lp->reset_work); + free_netdev(dev); +} +EXPORT_SYMBOL(free_arcdev); + /* Open/initialize the board. This is called sometime after booting when * the 'ifconfig' program is run. * @@ -587,6 +634,10 @@ int arcnet_close(struct net_device *dev) /* shut down the card */ lp->hw.close(dev); + + /* reset counters */ + lp->reset_in_progress = 0; + module_put(lp->hw.owner); return 0; } @@ -820,6 +871,9 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) spin_lock_irqsave(&lp->lock, flags); + if (lp->reset_in_progress) + goto out; + /* RESET flag was enabled - if device is not running, we must * clear it right away (but nothing else). */ @@ -852,11 +906,14 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) if (status & RESETflag) { arc_printk(D_NORMAL, dev, "spurious reset (status=%Xh)\n", status); - arcnet_close(dev); - arcnet_open(dev); + + lp->reset_in_progress = 1; + netif_stop_queue(dev); + netif_carrier_off(dev); + schedule_work(&lp->reset_work); /* get out of the interrupt handler! */ - break; + goto out; } /* RX is inhibited - we must have received something. * Prepare to receive into the next buffer. 
@@ -1052,6 +1109,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) udelay(1); lp->hw.intmask(dev, lp->intmask); +out: spin_unlock_irqrestore(&lp->lock, flags); return retval; } diff --git a/drivers/net/arcnet/com20020-isa.c b/drivers/net/arcnet/com20020-isa.c index 38fa60ddaf2e..fada3f23970e 100644 --- a/drivers/net/arcnet/com20020-isa.c +++ b/drivers/net/arcnet/com20020-isa.c @@ -169,7 +169,7 @@ static int __init com20020_init(void) dev->irq = 9; if (com20020isa_probe(dev)) { - free_netdev(dev); + free_arcdev(dev); return -EIO; } @@ -182,7 +182,7 @@ static void __exit com20020_exit(void) unregister_netdev(my_dev); free_irq(my_dev->irq, my_dev); release_region(my_dev->base_addr, ARCNET_TOTAL_SIZE); - free_netdev(my_dev); + free_arcdev(my_dev); } #ifndef MODULE diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 9f44e2e458df..b4f8798d8c50 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -294,7 +294,7 @@ static void com20020pci_remove(struct pci_dev *pdev) unregister_netdev(dev); free_irq(dev->irq, dev); - free_netdev(dev); + free_arcdev(dev); } } diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c index cf607ffcf358..9cc5eb6a8e90 100644 --- a/drivers/net/arcnet/com20020_cs.c +++ b/drivers/net/arcnet/com20020_cs.c @@ -177,7 +177,7 @@ static void com20020_detach(struct pcmcia_device *link) dev = info->dev; if (dev) { dev_dbg(&link->dev, "kfree...\n"); - free_netdev(dev); + free_arcdev(dev); } dev_dbg(&link->dev, "kfree2...\n"); kfree(info); diff --git a/drivers/net/arcnet/com90io.c b/drivers/net/arcnet/com90io.c index 4e56aaf2b984..e9fc0577ddc0 100644 --- a/drivers/net/arcnet/com90io.c +++ b/drivers/net/arcnet/com90io.c @@ -394,7 +394,7 @@ static int __init com90io_init(void) err = com90io_probe(dev); if (err) { - free_netdev(dev); + free_arcdev(dev); return err; } @@ -417,7 +417,7 @@ static void __exit com90io_exit(void) free_irq(dev->irq, dev); release_region(dev->base_addr, ARCNET_TOTAL_SIZE); - free_netdev(dev); + free_arcdev(dev); } module_init(com90io_init) diff --git a/drivers/net/arcnet/com90xx.c b/drivers/net/arcnet/com90xx.c index ca4a57c30bf8..5e38a2d6623c 100644 --- a/drivers/net/arcnet/com90xx.c +++ b/drivers/net/arcnet/com90xx.c @@ -554,7 +554,7 @@ err_free_irq: err_release_mem: release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); err_free_dev: - free_netdev(dev); + free_arcdev(dev); return -EIO; } @@ -672,7 +672,7 @@ static void __exit com90xx_exit(void) release_region(dev->base_addr, ARCNET_TOTAL_SIZE); release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); - free_netdev(dev); + free_arcdev(dev); } } From 0571d23ff9d3b2c579b0f7564f1cf6f19c9be32d Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 14 Mar 2021 14:08:36 -0400 Subject: [PATCH 15/54] net: arcnet: com20020 fix error handling [ Upstream commit 6577b9a551aedb86bca6d4438c28386361845108 ] There are two issues when handling error case in com20020pci_probe() 1. priv might be not initialized yet when calling com20020pci_remove() from com20020pci_probe(), since the priv is set at the very last but it can jump to error handling in the middle and priv remains NULL. 2. 
memory leak - the net device is allocated in alloc_arcdev but not properly released if error happens in the middle of the big for loop [ 1.529110] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 1.531447] RIP: 0010:com20020pci_remove+0x15/0x60 [com20020_pci] [ 1.536805] Call Trace: [ 1.536939] com20020pci_probe+0x3f2/0x48c [com20020_pci] [ 1.537226] local_pci_probe+0x48/0x80 [ 1.539918] com20020pci_init+0x3f/0x1000 [com20020_pci] Signed-off-by: Tong Zhang Signed-off-by: David S. Miller Stable-dep-of: 6b17a597fc2f ("arcnet: restoring support for multiple Sohard Arcnet cards") Signed-off-by: Sasha Levin --- drivers/net/arcnet/com20020-pci.c | 34 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index b4f8798d8c50..28dccbc0e8d8 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -127,6 +127,8 @@ static int com20020pci_probe(struct pci_dev *pdev, int i, ioaddr, ret; struct resource *r; + ret = 0; + if (pci_enable_device(pdev)) return -EIO; @@ -142,6 +144,8 @@ static int com20020pci_probe(struct pci_dev *pdev, priv->ci = ci; mm = &ci->misc_map; + pci_set_drvdata(pdev, priv); + INIT_LIST_HEAD(&priv->list_dev); if (mm->size) { @@ -164,7 +168,7 @@ static int com20020pci_probe(struct pci_dev *pdev, dev = alloc_arcdev(device); if (!dev) { ret = -ENOMEM; - goto out_port; + break; } dev->dev_port = i; @@ -181,7 +185,7 @@ static int com20020pci_probe(struct pci_dev *pdev, pr_err("IO region %xh-%xh already allocated\n", ioaddr, ioaddr + cm->size - 1); ret = -EBUSY; - goto out_port; + goto err_free_arcdev; } /* Dummy access after Reset @@ -219,18 +223,18 @@ static int com20020pci_probe(struct pci_dev *pdev, if (arcnet_inb(ioaddr, COM20020_REG_R_STATUS) == 0xFF) { pr_err("IO address %Xh is empty!\n", ioaddr); ret = -EIO; - goto out_port; + goto err_free_arcdev; } if (com20020_check(dev)) { ret = -EIO; - goto out_port; + goto err_free_arcdev; } card = devm_kzalloc(&pdev->dev, sizeof(struct com20020_dev), GFP_KERNEL); if (!card) { ret = -ENOMEM; - goto out_port; + goto err_free_arcdev; } card->index = i; @@ -256,29 +260,29 @@ static int com20020pci_probe(struct pci_dev *pdev, ret = devm_led_classdev_register(&pdev->dev, &card->tx_led); if (ret) - goto out_port; + goto err_free_arcdev; ret = devm_led_classdev_register(&pdev->dev, &card->recon_led); if (ret) - goto out_port; + goto err_free_arcdev; dev_set_drvdata(&dev->dev, card); ret = com20020_found(dev, IRQF_SHARED); if (ret) - goto out_port; + goto err_free_arcdev; devm_arcnet_led_init(dev, dev->dev_id, i); list_add(&card->list, &priv->list_dev); + continue; + +err_free_arcdev: + free_arcdev(dev); + break; } - - pci_set_drvdata(pdev, priv); - - return 0; - -out_port: - com20020pci_remove(pdev); + if (ret) + com20020pci_remove(pdev); return ret; } From 2e4ad90b15a7341c2d96d2dc6df6d135d72256b6 Mon Sep 17 00:00:00 2001 From: Thomas Reichinger Date: Thu, 30 Nov 2023 12:35:03 +0100 Subject: [PATCH 16/54] arcnet: restoring support for multiple Sohard Arcnet cards [ Upstream commit 6b17a597fc2f13aaaa0a2780eb7edb9ae7ac9aea ] Probe of Sohard Arcnet cards fails, if 2 or more cards are installed in a system. See kernel log: [ 2.759203] arcnet: arcnet loaded [ 2.763648] arcnet:com20020: COM20020 chipset support (by David Woodhouse et al.) 
[ 2.770585] arcnet:com20020_pci: COM20020 PCI support [ 2.772295] com20020 0000:02:00.0: enabling device (0000 -> 0003) [ 2.772354] (unnamed net_device) (uninitialized): PLX-PCI Controls ... [ 3.071301] com20020 0000:02:00.0 arc0-0 (uninitialized): PCI COM20020: station FFh found at F080h, IRQ 101. [ 3.071305] com20020 0000:02:00.0 arc0-0 (uninitialized): Using CKP 64 - data rate 2.5 Mb/s [ 3.071534] com20020 0000:07:00.0: enabling device (0000 -> 0003) [ 3.071581] (unnamed net_device) (uninitialized): PLX-PCI Controls ... [ 3.369501] com20020 0000:07:00.0: Led pci:green:tx:0-0 renamed to pci:green:tx:0-0_1 due to name collision [ 3.369535] com20020 0000:07:00.0: Led pci:red:recon:0-0 renamed to pci:red:recon:0-0_1 due to name collision [ 3.370586] com20020 0000:07:00.0 arc0-0 (uninitialized): PCI COM20020: station E1h found at C000h, IRQ 35. [ 3.370589] com20020 0000:07:00.0 arc0-0 (uninitialized): Using CKP 64 - data rate 2.5 Mb/s [ 3.370608] com20020: probe of 0000:07:00.0 failed with error -5 commit 5ef216c1f848 ("arcnet: com20020-pci: add rotary index support") changes the device name of all COM20020 based PCI cards, even if only some cards support this: snprintf(dev->name, sizeof(dev->name), "arc%d-%d", dev->dev_id, i); The error happens because all Sohard Arcnet cards would be called arc0-0, since the Sohard Arcnet cards don't have a PLX rotary coder. I.e. EAE Arcnet cards have a PLX rotary coder, which sets the first decimal, ensuring unique devices names. This patch adds two new card feature flags to indicate which cards support LEDs and the PLX rotary coder. For EAE based cards the names still depend on the PLX rotary coder (untested, since missing EAE hardware). For Sohard based cards, this patch will result in devices being called arc0, arc1, ... (tested). Signed-off-by: Thomas Reichinger Fixes: 5ef216c1f848 ("arcnet: com20020-pci: add rotary index support") Link: https://lore.kernel.org/r/20231130113503.6812-1-thomas.reichinger@sohard.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/arcnet/arcdevice.h | 2 + drivers/net/arcnet/com20020-pci.c | 89 ++++++++++++++++--------------- 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/drivers/net/arcnet/arcdevice.h b/drivers/net/arcnet/arcdevice.h index 1ad5e4a03d23..c72ad5e2ac5f 100644 --- a/drivers/net/arcnet/arcdevice.h +++ b/drivers/net/arcnet/arcdevice.h @@ -191,6 +191,8 @@ do { \ #define ARC_IS_5MBIT 1 /* card default speed is 5MBit */ #define ARC_CAN_10MBIT 2 /* card uses COM20022, supporting 10MBit, but default is 2.5MBit. 
*/ +#define ARC_HAS_LED 4 /* card has software controlled LEDs */ +#define ARC_HAS_ROTARY 8 /* card has rotary encoder */ /* information needed to define an encapsulation driver */ struct ArcProto { diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 28dccbc0e8d8..9d9e4200064f 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -213,12 +213,13 @@ static int com20020pci_probe(struct pci_dev *pdev, if (!strncmp(ci->name, "EAE PLX-PCI FB2", 15)) lp->backplane = 1; - /* Get the dev_id from the PLX rotary coder */ - if (!strncmp(ci->name, "EAE PLX-PCI MA1", 15)) - dev_id_mask = 0x3; - dev->dev_id = (inb(priv->misc + ci->rotary) >> 4) & dev_id_mask; - - snprintf(dev->name, sizeof(dev->name), "arc%d-%d", dev->dev_id, i); + if (ci->flags & ARC_HAS_ROTARY) { + /* Get the dev_id from the PLX rotary coder */ + if (!strncmp(ci->name, "EAE PLX-PCI MA1", 15)) + dev_id_mask = 0x3; + dev->dev_id = (inb(priv->misc + ci->rotary) >> 4) & dev_id_mask; + snprintf(dev->name, sizeof(dev->name), "arc%d-%d", dev->dev_id, i); + } if (arcnet_inb(ioaddr, COM20020_REG_R_STATUS) == 0xFF) { pr_err("IO address %Xh is empty!\n", ioaddr); @@ -230,6 +231,10 @@ static int com20020pci_probe(struct pci_dev *pdev, goto err_free_arcdev; } + ret = com20020_found(dev, IRQF_SHARED); + if (ret) + goto err_free_arcdev; + card = devm_kzalloc(&pdev->dev, sizeof(struct com20020_dev), GFP_KERNEL); if (!card) { @@ -239,41 +244,39 @@ static int com20020pci_probe(struct pci_dev *pdev, card->index = i; card->pci_priv = priv; - card->tx_led.brightness_set = led_tx_set; - card->tx_led.default_trigger = devm_kasprintf(&pdev->dev, - GFP_KERNEL, "arc%d-%d-tx", - dev->dev_id, i); - card->tx_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, - "pci:green:tx:%d-%d", - dev->dev_id, i); - card->tx_led.dev = &dev->dev; - card->recon_led.brightness_set = led_recon_set; - card->recon_led.default_trigger = devm_kasprintf(&pdev->dev, - GFP_KERNEL, "arc%d-%d-recon", - dev->dev_id, i); - card->recon_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, - "pci:red:recon:%d-%d", - dev->dev_id, i); - card->recon_led.dev = &dev->dev; + if (ci->flags & ARC_HAS_LED) { + card->tx_led.brightness_set = led_tx_set; + card->tx_led.default_trigger = devm_kasprintf(&pdev->dev, + GFP_KERNEL, "arc%d-%d-tx", + dev->dev_id, i); + card->tx_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, + "pci:green:tx:%d-%d", + dev->dev_id, i); + + card->tx_led.dev = &dev->dev; + card->recon_led.brightness_set = led_recon_set; + card->recon_led.default_trigger = devm_kasprintf(&pdev->dev, + GFP_KERNEL, "arc%d-%d-recon", + dev->dev_id, i); + card->recon_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, + "pci:red:recon:%d-%d", + dev->dev_id, i); + card->recon_led.dev = &dev->dev; + + ret = devm_led_classdev_register(&pdev->dev, &card->tx_led); + if (ret) + goto err_free_arcdev; + + ret = devm_led_classdev_register(&pdev->dev, &card->recon_led); + if (ret) + goto err_free_arcdev; + + dev_set_drvdata(&dev->dev, card); + devm_arcnet_led_init(dev, dev->dev_id, i); + } + card->dev = dev; - - ret = devm_led_classdev_register(&pdev->dev, &card->tx_led); - if (ret) - goto err_free_arcdev; - - ret = devm_led_classdev_register(&pdev->dev, &card->recon_led); - if (ret) - goto err_free_arcdev; - - dev_set_drvdata(&dev->dev, card); - - ret = com20020_found(dev, IRQF_SHARED); - if (ret) - goto err_free_arcdev; - - devm_arcnet_led_init(dev, dev->dev_id, i); - list_add(&card->list, &priv->list_dev); continue; @@ -329,7 +332,7 @@ static 
struct com20020_pci_card_info card_info_5mbit = { }; static struct com20020_pci_card_info card_info_sohard = { - .name = "PLX-PCI", + .name = "SOHARD SH ARC-PCI", .devcount = 1, /* SOHARD needs PCI base addr 4 */ .chan_map_tbl = { @@ -364,7 +367,7 @@ static struct com20020_pci_card_info card_info_eae_arc1 = { }, }, .rotary = 0x0, - .flags = ARC_CAN_10MBIT, + .flags = ARC_HAS_ROTARY | ARC_HAS_LED | ARC_CAN_10MBIT, }; static struct com20020_pci_card_info card_info_eae_ma1 = { @@ -396,7 +399,7 @@ static struct com20020_pci_card_info card_info_eae_ma1 = { }, }, .rotary = 0x0, - .flags = ARC_CAN_10MBIT, + .flags = ARC_HAS_ROTARY | ARC_HAS_LED | ARC_CAN_10MBIT, }; static struct com20020_pci_card_info card_info_eae_fb2 = { @@ -421,7 +424,7 @@ static struct com20020_pci_card_info card_info_eae_fb2 = { }, }, .rotary = 0x0, - .flags = ARC_CAN_10MBIT, + .flags = ARC_HAS_ROTARY | ARC_HAS_LED | ARC_CAN_10MBIT, }; static const struct pci_device_id com20020pci_id_table[] = { From 1fbcc804891c4eafd7155e72ebec9001537711c8 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 3 Dec 2023 01:14:41 +0900 Subject: [PATCH 17/54] ipv4: ip_gre: Avoid skb_pull() failure in ipgre_xmit() [ Upstream commit 80d875cfc9d3711a029f234ef7d680db79e8fa4b ] In ipgre_xmit(), skb_pull() may fail even if pskb_inet_may_pull() returns true. For example, applications can use PF_PACKET to create a malformed packet with no IP header. This type of packet causes a problem such as uninit-value access. This patch ensures that skb_pull() can pull the required size by checking the skb with pskb_network_may_pull() before skb_pull(). Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Shigeru Yoshida Reviewed-by: Eric Dumazet Reviewed-by: Suman Ghosh Link: https://lore.kernel.org/r/20231202161441.221135-1-syoshida@redhat.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/ipv4/ip_gre.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e16373640f4c..38c8db78cda1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -683,15 +683,18 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { + int pull_len = tunnel->hlen + sizeof(struct iphdr); + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; - /* Pull skb since ip_tunnel_xmit() needs skb->data pointing - * to gre header. - */ - skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + if (!pskb_network_may_pull(skb, pull_len)) + goto free_skb; + + /* ip_tunnel_xmit() needs skb->data pointing to gre header. */ + skb_pull(skb, pull_len); skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && From d371b0590c9ccc32bd02f2a3818e9fbd413f10b2 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Mon, 4 Dec 2023 22:32:32 +0800 Subject: [PATCH 18/54] net: hns: fix fake link up on xge port [ Upstream commit f708aba40f9c1eeb9c7e93ed4863b5f85b09b288 ] If a xge port just connect with an optical module and no fiber, it may have a fake link up because there may be interference on the hardware. This patch adds an anti-shake to avoid the problem. And the time of anti-shake is base on tests. 
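For reference, the debounce implemented by the new helper below works out to HNS_MAC_LINK_WAIT_CNT * HNS_MAC_LINK_WAIT_TIME = 40 * 5 ms, i.e. a window of roughly 200 ms: the port is only reported as up if every sample taken during that window still reads up, and the first down reading ends the loop. The check is applied only on a down-to-up transition of a FIBER port, and the figures are the empirically chosen values from this patch, not a hardware requirement.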
Fixes: b917078c1c10 ("net: hns: Add ACPI support to check SFP present") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Reviewed-by: Wojciech Drewek Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../net/ethernet/hisilicon/hns/hns_dsaf_mac.c | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c index cfdc92de9dc0..d2791bcff5d4 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c @@ -70,6 +70,27 @@ static enum mac_mode hns_get_enet_interface(const struct hns_mac_cb *mac_cb) } } +static u32 hns_mac_link_anti_shake(struct mac_driver *mac_ctrl_drv) +{ +#define HNS_MAC_LINK_WAIT_TIME 5 +#define HNS_MAC_LINK_WAIT_CNT 40 + + u32 link_status = 0; + int i; + + if (!mac_ctrl_drv->get_link_status) + return link_status; + + for (i = 0; i < HNS_MAC_LINK_WAIT_CNT; i++) { + msleep(HNS_MAC_LINK_WAIT_TIME); + mac_ctrl_drv->get_link_status(mac_ctrl_drv, &link_status); + if (!link_status) + break; + } + + return link_status; +} + void hns_mac_get_link_status(struct hns_mac_cb *mac_cb, u32 *link_status) { struct mac_driver *mac_ctrl_drv; @@ -87,6 +108,14 @@ void hns_mac_get_link_status(struct hns_mac_cb *mac_cb, u32 *link_status) &sfp_prsnt); if (!ret) *link_status = *link_status && sfp_prsnt; + + /* for FIBER port, it may have a fake link up. + * when the link status changes from down to up, we need to do + * anti-shake. the anti-shake time is base on tests. + * only FIBER port need to do this. + */ + if (*link_status && !mac_cb->link) + *link_status = hns_mac_link_anti_shake(mac_ctrl_drv); } mac_cb->link = *link_status; From 22566a81f62b96400fbe75f3f67de3c5b83fe453 Mon Sep 17 00:00:00 2001 From: Lukasz Pawelczyk Date: Fri, 10 May 2019 13:46:22 +0200 Subject: [PATCH 19/54] netfilter: xt_owner: Add supplementary groups option [ Upstream commit ea6cc2fd8a2b89ab6dcd096ba6dbc1ecbdf26564 ] The XT_OWNER_SUPPL_GROUPS flag causes GIDs specified with XT_OWNER_GID to be also checked in the supplementary groups of a process. f_cred->group_info cannot be modified during its lifetime and f_cred holds a reference to it so it's safe to use. 
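As a usage sketch (the kernel side only defines the XT_OWNER_SUPPL_GROUPS flag; the iptables option name used below, --suppl-groups for the owner match, is an assumption, so check iptables-extensions(8) for the version you run):

  # Match locally generated packets whose owning process carries GID 1000
  # either as its effective group or among its supplementary groups.
  iptables -A OUTPUT -m owner --gid-owner 1000 --suppl-groups -j ACCEPT

Without the new flag the match keeps its old behaviour and only compares f_cred->fsgid against the configured GID range.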
Signed-off-by: Lukasz Pawelczyk Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 7ae836a3d630 ("netfilter: xt_owner: Fix for unsafe access of sk->sk_socket") Signed-off-by: Sasha Levin --- include/uapi/linux/netfilter/xt_owner.h | 7 ++++--- net/netfilter/xt_owner.c | 23 ++++++++++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/netfilter/xt_owner.h b/include/uapi/linux/netfilter/xt_owner.h index fa3ad84957d5..9e98c09eda32 100644 --- a/include/uapi/linux/netfilter/xt_owner.h +++ b/include/uapi/linux/netfilter/xt_owner.h @@ -5,9 +5,10 @@ #include enum { - XT_OWNER_UID = 1 << 0, - XT_OWNER_GID = 1 << 1, - XT_OWNER_SOCKET = 1 << 2, + XT_OWNER_UID = 1 << 0, + XT_OWNER_GID = 1 << 1, + XT_OWNER_SOCKET = 1 << 2, + XT_OWNER_SUPPL_GROUPS = 1 << 3, }; struct xt_owner_match_info { diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 46686fb73784..a8784502aca6 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -91,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { + unsigned int i, match = false; kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); - if ((gid_gte(filp->f_cred->fsgid, gid_min) && - gid_lte(filp->f_cred->fsgid, gid_max)) ^ - !(info->invert & XT_OWNER_GID)) + struct group_info *gi = filp->f_cred->group_info; + + if (gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) + match = true; + + if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) { + for (i = 0; i < gi->ngroups; ++i) { + kgid_t group = gi->gid[i]; + + if (gid_gte(group, gid_min) && + gid_lte(group, gid_max)) { + match = true; + break; + } + } + } + + if (match ^ !(info->invert & XT_OWNER_GID)) return false; } From 6b5bf47c023b1f71afcba818804756aae901822f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 5 Dec 2023 21:58:12 +0100 Subject: [PATCH 20/54] netfilter: xt_owner: Fix for unsafe access of sk->sk_socket [ Upstream commit 7ae836a3d630e146b732fe8ef7d86b243748751f ] A concurrently running sock_orphan() may NULL the sk_socket pointer in between check and deref. Follow other users (like nft_meta.c for instance) and acquire sk_callback_lock before dereferencing sk_socket. Fixes: 0265ab44bacc ("[NETFILTER]: merge ipt_owner/ip6t_owner in xt_owner") Reported-by: Jann Horn Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/xt_owner.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index a8784502aca6..0c101b25cacf 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -76,18 +76,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - filp = sk->sk_socket->file; - if (filp == NULL) + read_lock_bh(&sk->sk_callback_lock); + filp = sk->sk_socket ? 
sk->sk_socket->file : NULL; + if (filp == NULL) { + read_unlock_bh(&sk->sk_callback_lock); return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; + } if (info->match & XT_OWNER_UID) { kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ - !(info->invert & XT_OWNER_UID)) + !(info->invert & XT_OWNER_UID)) { + read_unlock_bh(&sk->sk_callback_lock); return false; + } } if (info->match & XT_OWNER_GID) { @@ -112,10 +117,13 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } } - if (match ^ !(info->invert & XT_OWNER_GID)) + if (match ^ !(info->invert & XT_OWNER_GID)) { + read_unlock_bh(&sk->sk_callback_lock); return false; + } } + read_unlock_bh(&sk->sk_callback_lock); return true; } From 458f07ffeccd17f99942311e09ef574ddf4a414a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Dec 2023 16:18:41 +0000 Subject: [PATCH 21/54] tcp: do not accept ACK of bytes we never sent [ Upstream commit 3d501dd326fb1c73f1b8206d4c6e1d7b15c07e27 ] This patch is based on a detailed report and ideas from Yepeng Pan and Christian Rossow. ACK seq validation is currently following RFC 5961 5.2 guidelines: The ACK value is considered acceptable only if it is in the range of ((SND.UNA - MAX.SND.WND) <= SEG.ACK <= SND.NXT). All incoming segments whose ACK value doesn't satisfy the above condition MUST be discarded and an ACK sent back. It needs to be noted that RFC 793 on page 72 (fifth check) says: "If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored. If the ACK acknowledges something not yet sent (SEG.ACK > SND.NXT) then send an ACK, drop the segment, and return". The "ignored" above implies that the processing of the incoming data segment continues, which means the ACK value is treated as acceptable. This mitigation makes the ACK check more stringent since any ACK < SND.UNA wouldn't be accepted, instead only ACKs that are in the range ((SND.UNA - MAX.SND.WND) <= SEG.ACK <= SND.NXT) get through. This can be refined for new (and possibly spoofed) flows, by not accepting ACK for bytes that were never sent. This greatly improves TCP security at a little cost. I added a Fixes: tag to make sure this patch will reach stable trees, even if the 'blamed' patch was adhering to the RFC. tp->bytes_acked was added in linux-4.2 Following packetdrill test (courtesy of Yepeng Pan) shows the issue at hand: 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1024) = 0 // ---------------- Handshake ------------------- // // when window scale is set to 14 the window size can be extended to // 65535 * (2^14) = 1073725440. Linux would accept an ACK packet // with ack number in (Server_ISN+1-1073725440. Server_ISN+1) // ,though this ack number acknowledges some data never // sent by the server. +0 < S 0:0(0) win 65535 +0 > S. 0:0(0) ack 1 <...> +0 < . 1:1(0) ack 1 win 65535 +0 accept(3, ..., ...) = 4 // For the established connection, we send an ACK packet, // the ack packet uses ack number 1 - 1073725300 + 2^32, // where 2^32 is used to wrap around. // Note: we used 1073725300 instead of 1073725440 to avoid possible // edge cases. // 1 - 1073725300 + 2^32 = 3221241997 // Oops, old kernels happily accept this packet. +0 < . 
1:1001(1000) ack 3221241997 win 65535 // After the kernel fix the following will be replaced by a challenge ACK, // and prior malicious frame would be dropped. +0 > . 1:1(0) ack 1001 Fixes: 354e4aa391ed ("tcp: RFC 5961 5.2 Blind Data Injection Attack Mitigation") Signed-off-by: Eric Dumazet Reported-by: Yepeng Pan Reported-by: Christian Rossow Acked-by: Neal Cardwell Link: https://lore.kernel.org/r/20231205161841.2702925-1-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/tcp_input.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0052a6194cc1..407ad07dc598 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3640,8 +3640,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * then we can probably ignore it. */ if (before(ack, prior_snd_una)) { + u32 max_window; + + /* do not accept ACK for bytes we never sent. */ + max_window = min_t(u64, tp->max_window, tp->bytes_acked); /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ - if (before(ack, prior_snd_una - tp->max_window)) { + if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) tcp_send_challenge_ack(sk, skb); return -1; From 98936b782e71fc5c2fa613921aa0bcbb674ad9b5 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 21 Nov 2023 00:29:47 -0800 Subject: [PATCH 22/54] RDMA/bnxt_re: Correct module description string [ Upstream commit 422b19f7f006e813ee0865aadce6a62b3c263c42 ] The word "Driver" is repeated twice in the "modinfo bnxt_re" output description. Fix it. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1700555387-6277-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky Signed-off-by: Sasha Levin --- drivers/infiniband/hw/bnxt_re/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index f1b666c80f36..b34e221507f5 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -70,7 +70,7 @@ static char version[] = BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n"; MODULE_AUTHOR("Eddie Wai "); -MODULE_DESCRIPTION(BNXT_RE_DESC " Driver"); +MODULE_DESCRIPTION(BNXT_RE_DESC); MODULE_LICENSE("Dual BSD/GPL"); /* globals */ From 19f671d446ee9a8c4e40a85288496e6fd68334c9 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 24 Nov 2023 19:27:47 +0100 Subject: [PATCH 23/54] hwmon: (acpi_power_meter) Fix 4.29 MW bug [ Upstream commit 1fefca6c57fb928d2131ff365270cbf863d89c88 ] The ACPI specification says: "If an error occurs while obtaining the meter reading or if the value is not available then an Integer with all bits set is returned" Since the "integer" is 32 bits in case of the ACPI power meter, userspace will get a power reading of 2^32 * 1000 miliwatts (~4.29 MW) in case of such an error. This was discovered due to a lm_sensors bugreport (https://github.com/lm-sensors/lm-sensors/issues/460). Fix this by returning -ENODATA instead. 
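To spell the headline figure out: the hwmon sysfs interface reports power in microwatts, and show_power() multiplies the ACPI reading, which is kept in milliwatts, by 1000. With the all-bits-set error value that gives

  0xFFFFFFFF mW * 1000 = 4,294,967,295,000 uW, roughly 4.29 MW

which is the bogus reading userspace used to see; with this change such readings surface as -ENODATA instead.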
Tested-by: Fixes: de584afa5e18 ("hwmon driver for ACPI 4.0 power meters") Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231124182747.13956-1-W_Armin@gmx.de Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/acpi_power_meter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c index 2f40da04a944..b1e5ad59ec71 100644 --- a/drivers/hwmon/acpi_power_meter.c +++ b/drivers/hwmon/acpi_power_meter.c @@ -45,6 +45,7 @@ ACPI_MODULE_NAME(ACPI_POWER_METER_NAME); #define POWER_METER_CAN_NOTIFY (1 << 3) #define POWER_METER_IS_BATTERY (1 << 8) #define UNKNOWN_HYSTERESIS 0xFFFFFFFF +#define UNKNOWN_POWER 0xFFFFFFFF #define METER_NOTIFY_CONFIG 0x80 #define METER_NOTIFY_TRIP 0x81 @@ -356,6 +357,9 @@ static ssize_t show_power(struct device *dev, update_meter(resource); mutex_unlock(&resource->lock); + if (resource->power == UNKNOWN_POWER) + return -ENODATA; + return sprintf(buf, "%llu\n", resource->power * 1000); } From 469d1f9ed0edaa34db28776ac979de983bc50e5c Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 5 Dec 2023 17:17:35 +0100 Subject: [PATCH 24/54] tracing: Fix a warning when allocating buffered events fails [ Upstream commit 34209fe83ef8404353f91ab4ea4035dbc9922d04 ] Function trace_buffered_event_disable() produces an unexpected warning when the previous call to trace_buffered_event_enable() fails to allocate pages for buffered events. The situation can occur as follows: * The counter trace_buffered_event_ref is at 0. * The soft mode gets enabled for some event and trace_buffered_event_enable() is called. The function increments trace_buffered_event_ref to 1 and starts allocating event pages. * The allocation fails for some page and trace_buffered_event_disable() is called for cleanup. * Function trace_buffered_event_disable() decrements trace_buffered_event_ref back to 0, recognizes that it was the last use of buffered events and frees all allocated pages. * The control goes back to trace_buffered_event_enable() which returns. The caller of trace_buffered_event_enable() has no information that the function actually failed. * Some time later, the soft mode is disabled for the same event. Function trace_buffered_event_disable() is called. It warns on "WARN_ON_ONCE(!trace_buffered_event_ref)" and returns. Buffered events are just an optimization and can handle failures. Make trace_buffered_event_enable() exit on the first failure and left any cleanup later to when trace_buffered_event_disable() is called. 
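The reason a partial allocation is tolerable is visible on the consumer side: the fast path is only taken when the current CPU actually has a page. Roughly, as a simplified sketch of trace_event_buffer_lock_reserve() rather than the literal code:

  entry = this_cpu_read(trace_buffered_event);
  if (entry) {
          /* fast path: use the preallocated per-CPU page */
          if (this_cpu_inc_return(trace_buffered_event_cnt) == 1)
                  return entry;
          this_cpu_dec(trace_buffered_event_cnt);
  }
  /* CPUs whose page allocation failed simply take the slow path */
  return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);

So a CPU that missed its page only loses the optimization, nothing more.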
Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ Link: https://lkml.kernel.org/r/20231205161736.19663-3-petr.pavlu@suse.com Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/trace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f44c8f1fd3ec..a6d84349bcc4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2209,8 +2209,11 @@ void trace_buffered_event_enable(void) for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto failed; + /* This is just an optimization and can handle failures */ + if (!page) { + pr_err("Failed to allocate event buffer\n"); + break; + } event = page_address(page); memset(event, 0, sizeof(*event)); @@ -2224,10 +2227,6 @@ void trace_buffered_event_enable(void) WARN_ON_ONCE(1); preempt_enable(); } - - return; - failed: - trace_buffered_event_disable(); } static void enable_trace_buffered_event(void *data) From 9682d5777a84c23ba85328379277c632cb24c274 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Thu, 23 Nov 2023 16:19:41 +0800 Subject: [PATCH 25/54] scsi: be2iscsi: Fix a memleak in beiscsi_init_wrb_handle() [ Upstream commit 235f2b548d7f4ac5931d834f05d3f7f5166a2e72 ] When an error occurs in the for loop of beiscsi_init_wrb_handle(), we should free phwi_ctxt->be_wrbq before returning an error code to prevent potential memleak. Fixes: a7909b396ba7 ("[SCSI] be2iscsi: Fix dynamic CID allocation Mechanism in driver") Signed-off-by: Dinghao Liu Link: https://lore.kernel.org/r/20231123081941.24854-1-dinghao.liu@zju.edu.cn Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/be2iscsi/be_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index 50e9b4b68357..295eef40e275 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -2713,6 +2713,7 @@ init_wrb_hndl_failed: kfree(pwrb_context->pwrb_handle_base); kfree(pwrb_context->pwrb_handle_basestd); } + kfree(phwi_ctxt->be_wrbq); return -ENOMEM; } From 47aedb4ce2fd2ef3342f13543b3491d922c6a4a3 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 22 Nov 2023 14:46:36 +0800 Subject: [PATCH 26/54] ARM: imx: Check return value of devm_kasprintf in imx_mmdc_perf_init [ Upstream commit 1c2b1049af3f86545fcc5fae0fc725fb64b3a09e ] devm_kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity. 
Release the id allocated in 'mmdc_pmu_init' when 'devm_kasprintf' return NULL Suggested-by: Ahmad Fatoum Fixes: e76bdfd7403a ("ARM: imx: Added perf functionality to mmdc driver") Signed-off-by: Kunwu Chan Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/mach-imx/mmdc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c index 965a41572283..67cf02c2f2ec 100644 --- a/arch/arm/mach-imx/mmdc.c +++ b/arch/arm/mach-imx/mmdc.c @@ -513,6 +513,10 @@ static int imx_mmdc_perf_init(struct platform_device *pdev, void __iomem *mmdc_b name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "mmdc%d", ret); + if (!name) { + ret = -ENOMEM; + goto pmu_release_id; + } pmu_mmdc->mmdc_ipg_clk = mmdc_ipg_clk; pmu_mmdc->devtype_data = (struct fsl_mmdc_devtype_data *)of_id->data; @@ -535,9 +539,10 @@ static int imx_mmdc_perf_init(struct platform_device *pdev, void __iomem *mmdc_b pmu_register_err: pr_warn("MMDC Perf PMU failed (%d), disabled\n", ret); - ida_simple_remove(&mmdc_ida, pmu_mmdc->id); cpuhp_state_remove_instance_nocalls(cpuhp_mmdc_state, &pmu_mmdc->node); hrtimer_cancel(&pmu_mmdc->hrtimer); +pmu_release_id: + ida_simple_remove(&mmdc_ida, pmu_mmdc->id); pmu_free: kfree(pmu_mmdc); return ret; From 4563064684a0d0cdefd72ef49dbec3fe93404b3f Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Thu, 13 Feb 2020 10:52:56 +0800 Subject: [PATCH 27/54] ARM: dts: imx: make gpt node name generic [ Upstream commit 7c48b086965873c0aa93d99773cf64c033b76b2f ] Node name should be generic, use "timer" instead of "gpt" for gpt node. Signed-off-by: Anson Huang Signed-off-by: Shawn Guo Stable-dep-of: 397caf68e2d3 ("ARM: dts: imx7: Declare timers compatible with fsl,imx6dl-gpt") Signed-off-by: Sasha Levin --- arch/arm/boot/dts/imx6qdl.dtsi | 2 +- arch/arm/boot/dts/imx6sl.dtsi | 2 +- arch/arm/boot/dts/imx6sx.dtsi | 2 +- arch/arm/boot/dts/imx6ul.dtsi | 4 ++-- arch/arm/boot/dts/imx7s.dtsi | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm/boot/dts/imx6qdl.dtsi b/arch/arm/boot/dts/imx6qdl.dtsi index fcd7e4dc949a..b6aac90d1233 100644 --- a/arch/arm/boot/dts/imx6qdl.dtsi +++ b/arch/arm/boot/dts/imx6qdl.dtsi @@ -565,7 +565,7 @@ status = "disabled"; }; - gpt: gpt@2098000 { + gpt: timer@2098000 { compatible = "fsl,imx6q-gpt", "fsl,imx31-gpt"; reg = <0x02098000 0x4000>; interrupts = <0 55 IRQ_TYPE_LEVEL_HIGH>; diff --git a/arch/arm/boot/dts/imx6sl.dtsi b/arch/arm/boot/dts/imx6sl.dtsi index b00f791471c6..e9f6cffc7eee 100644 --- a/arch/arm/boot/dts/imx6sl.dtsi +++ b/arch/arm/boot/dts/imx6sl.dtsi @@ -374,7 +374,7 @@ clock-names = "ipg", "per"; }; - gpt: gpt@2098000 { + gpt: timer@2098000 { compatible = "fsl,imx6sl-gpt"; reg = <0x02098000 0x4000>; interrupts = <0 55 IRQ_TYPE_LEVEL_HIGH>; diff --git a/arch/arm/boot/dts/imx6sx.dtsi b/arch/arm/boot/dts/imx6sx.dtsi index a0c0e631ebbe..4cabcf32af06 100644 --- a/arch/arm/boot/dts/imx6sx.dtsi +++ b/arch/arm/boot/dts/imx6sx.dtsi @@ -468,7 +468,7 @@ status = "disabled"; }; - gpt: gpt@2098000 { + gpt: timer@2098000 { compatible = "fsl,imx6sx-gpt", "fsl,imx6dl-gpt"; reg = <0x02098000 0x4000>; interrupts = ; diff --git a/arch/arm/boot/dts/imx6ul.dtsi b/arch/arm/boot/dts/imx6ul.dtsi index dcb187995f76..7c3d8acf87be 100644 --- a/arch/arm/boot/dts/imx6ul.dtsi +++ b/arch/arm/boot/dts/imx6ul.dtsi @@ -423,7 +423,7 @@ status = "disabled"; }; - gpt1: gpt@2098000 { + gpt1: timer@2098000 { compatible = "fsl,imx6ul-gpt", "fsl,imx6sx-gpt"; reg = <0x02098000 0x4000>; interrupts = ; @@ -696,7 +696,7 @@ reg = 
<0x020e4000 0x4000>; }; - gpt2: gpt@20e8000 { + gpt2: timer@20e8000 { compatible = "fsl,imx6ul-gpt", "fsl,imx6sx-gpt"; reg = <0x020e8000 0x4000>; interrupts = ; diff --git a/arch/arm/boot/dts/imx7s.dtsi b/arch/arm/boot/dts/imx7s.dtsi index 8a6d698e253d..7cd05ab0b71f 100644 --- a/arch/arm/boot/dts/imx7s.dtsi +++ b/arch/arm/boot/dts/imx7s.dtsi @@ -439,7 +439,7 @@ fsl,input-sel = <&iomuxc>; }; - gpt1: gpt@302d0000 { + gpt1: timer@302d0000 { compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; reg = <0x302d0000 0x10000>; interrupts = ; @@ -448,7 +448,7 @@ clock-names = "ipg", "per"; }; - gpt2: gpt@302e0000 { + gpt2: timer@302e0000 { compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; reg = <0x302e0000 0x10000>; interrupts = ; @@ -458,7 +458,7 @@ status = "disabled"; }; - gpt3: gpt@302f0000 { + gpt3: timer@302f0000 { compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; reg = <0x302f0000 0x10000>; interrupts = ; @@ -468,7 +468,7 @@ status = "disabled"; }; - gpt4: gpt@30300000 { + gpt4: timer@30300000 { compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; reg = <0x30300000 0x10000>; interrupts = ; From 6aed43ffdd71b0bcba92399f8817e12438388558 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 27 Nov 2023 17:05:01 +0100 Subject: [PATCH 28/54] ARM: dts: imx7: Declare timers compatible with fsl,imx6dl-gpt [ Upstream commit 397caf68e2d36532054cb14ae8995537f27f8b61 ] The timer nodes declare compatibility with "fsl,imx6sx-gpt", which itself is compatible with "fsl,imx6dl-gpt". Switch the fallback compatible from "fsl,imx6sx-gpt" to "fsl,imx6dl-gpt". Fixes: 949673450291 ("ARM: dts: add imx7d soc dtsi file") Signed-off-by: Philipp Zabel Signed-off-by: Roland Hieber Signed-off-by: Shawn Guo Signed-off-by: Sasha Levin --- arch/arm/boot/dts/imx7s.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/imx7s.dtsi b/arch/arm/boot/dts/imx7s.dtsi index 7cd05ab0b71f..36c00fe29f4f 100644 --- a/arch/arm/boot/dts/imx7s.dtsi +++ b/arch/arm/boot/dts/imx7s.dtsi @@ -440,7 +440,7 @@ }; gpt1: timer@302d0000 { - compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; + compatible = "fsl,imx7d-gpt", "fsl,imx6dl-gpt"; reg = <0x302d0000 0x10000>; interrupts = ; clocks = <&clks IMX7D_GPT1_ROOT_CLK>, @@ -449,7 +449,7 @@ }; gpt2: timer@302e0000 { - compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; + compatible = "fsl,imx7d-gpt", "fsl,imx6dl-gpt"; reg = <0x302e0000 0x10000>; interrupts = ; clocks = <&clks IMX7D_GPT2_ROOT_CLK>, @@ -459,7 +459,7 @@ }; gpt3: timer@302f0000 { - compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; + compatible = "fsl,imx7d-gpt", "fsl,imx6dl-gpt"; reg = <0x302f0000 0x10000>; interrupts = ; clocks = <&clks IMX7D_GPT3_ROOT_CLK>, @@ -469,7 +469,7 @@ }; gpt4: timer@30300000 { - compatible = "fsl,imx7d-gpt", "fsl,imx6sx-gpt"; + compatible = "fsl,imx7d-gpt", "fsl,imx6dl-gpt"; reg = <0x30300000 0x10000>; interrupts = ; clocks = <&clks IMX7D_GPT4_ROOT_CLK>, From 25f8c84d8f2083874f8da361856c3f483c69efa0 Mon Sep 17 00:00:00 2001 From: Jason Zhang Date: Wed, 6 Dec 2023 09:31:39 +0800 Subject: [PATCH 29/54] ALSA: pcm: fix out-of-bounds in snd_pcm_state_names commit 2b3a7a302c9804e463f2ea5b54dc3a6ad106a344 upstream. The pcm state can be SNDRV_PCM_STATE_DISCONNECTED at disconnect callback, and there is not an entry of SNDRV_PCM_STATE_DISCONNECTED in snd_pcm_state_names. This patch adds the missing entry to resolve this issue. 
cat /proc/asound/card2/pcm0p/sub0/status That results in stack traces like the following: [ 99.702732][ T5171] Unexpected kernel BRK exception at EL1 [ 99.702774][ T5171] Internal error: BRK handler: f2005512 [#1] PREEMPT SMP [ 99.703858][ T5171] Modules linked in: bcmdhd(E) (...) [ 99.747425][ T5171] CPU: 3 PID: 5171 Comm: cat Tainted: G C OE 5.10.189-android13-4-00003-g4a17384380d8-ab11086999 #1 [ 99.748447][ T5171] Hardware name: Rockchip RK3588 CVTE V10 Board (DT) [ 99.749024][ T5171] pstate: 60400005 (nZCv daif +PAN -UAO -TCO BTYPE=--) [ 99.749616][ T5171] pc : snd_pcm_substream_proc_status_read+0x264/0x2bc [ 99.750204][ T5171] lr : snd_pcm_substream_proc_status_read+0xa4/0x2bc [ 99.750778][ T5171] sp : ffffffc0175abae0 [ 99.751132][ T5171] x29: ffffffc0175abb80 x28: ffffffc009a2c498 [ 99.751665][ T5171] x27: 0000000000000001 x26: ffffff810cbae6e8 [ 99.752199][ T5171] x25: 0000000000400cc0 x24: ffffffc0175abc60 [ 99.752729][ T5171] x23: 0000000000000000 x22: ffffff802f558400 [ 99.753263][ T5171] x21: ffffff81d8d8ff00 x20: ffffff81020cdc00 [ 99.753795][ T5171] x19: ffffff802d110000 x18: ffffffc014fbd058 [ 99.754326][ T5171] x17: 0000000000000000 x16: 0000000000000000 [ 99.754861][ T5171] x15: 000000000000c276 x14: ffffffff9a976fda [ 99.755392][ T5171] x13: 0000000065689089 x12: 000000000000d72e [ 99.755923][ T5171] x11: ffffff802d110000 x10: 00000000000000e0 [ 99.756457][ T5171] x9 : 9c431600c8385d00 x8 : 0000000000000008 [ 99.756990][ T5171] x7 : 0000000000000000 x6 : 000000000000003f [ 99.757522][ T5171] x5 : 0000000000000040 x4 : ffffffc0175abb70 [ 99.758056][ T5171] x3 : 0000000000000001 x2 : 0000000000000001 [ 99.758588][ T5171] x1 : 0000000000000000 x0 : 0000000000000000 [ 99.759123][ T5171] Call trace: [ 99.759404][ T5171] snd_pcm_substream_proc_status_read+0x264/0x2bc [ 99.759958][ T5171] snd_info_seq_show+0x54/0xa4 [ 99.760370][ T5171] seq_read_iter+0x19c/0x7d4 [ 99.760770][ T5171] seq_read+0xf0/0x128 [ 99.761117][ T5171] proc_reg_read+0x100/0x1f8 [ 99.761515][ T5171] vfs_read+0xf4/0x354 [ 99.761869][ T5171] ksys_read+0x7c/0x148 [ 99.762226][ T5171] __arm64_sys_read+0x20/0x30 [ 99.762625][ T5171] el0_svc_common+0xd0/0x1e4 [ 99.763023][ T5171] el0_svc+0x28/0x98 [ 99.763358][ T5171] el0_sync_handler+0x8c/0xf0 [ 99.763759][ T5171] el0_sync+0x1b8/0x1c0 [ 99.764118][ T5171] Code: d65f03c0 b9406102 17ffffae 94191565 (d42aa240) [ 99.764715][ T5171] ---[ end trace 1eeffa3e17c58e10 ]--- [ 99.780720][ T5171] Kernel panic - not syncing: BRK handler: Fatal exception Signed-off-by: Jason Zhang Cc: Link: https://lore.kernel.org/r/20231206013139.20506-1-jason.zhang@rock-chips.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/pcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/pcm.c b/sound/core/pcm.c index 8eed6244b832..601f60bb2e8a 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -266,6 +266,7 @@ static char *snd_pcm_state_names[] = { STATE(DRAINING), STATE(PAUSED), STATE(SUSPENDED), + STATE(DISCONNECTED), }; static char *snd_pcm_access_names[] = { From f7fc9d47f0ad7cd2202497491ee378ee90460960 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 1 Dec 2023 14:10:21 +0100 Subject: [PATCH 30/54] packet: Move reference count in packet_sock to atomic_long_t commit db3fadacaf0c817b222090290d06ca2a338422d0 upstream. In some potential instances the reference count on struct packet_sock could be saturated and cause overflows which gets the kernel a bit confused. 
To prevent this, move to a 64-bit atomic reference count on 64-bit architectures to prevent the possibility of this type to overflow. Because we can not handle saturation, using refcount_t is not possible in this place. Maybe someday in the future if it changes it could be used. Also, instead of using plain atomic64_t, use atomic_long_t instead. 32-bit machines tend to be memory-limited (i.e. anything that increases a reference uses so much memory that you can't actually get to 2**32 references). 32-bit architectures also tend to have serious problems with 64-bit atomics. Hence, atomic_long_t is the more natural solution. Reported-by: "The UK's National Cyber Security Centre (NCSC)" Co-developed-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman Signed-off-by: Daniel Borkmann Cc: Linus Torvalds Cc: stable@kernel.org Reviewed-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231201131021.19999-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 16 ++++++++-------- net/packet/internal.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 39ddfbda804e..377832981178 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4214,7 +4214,7 @@ static void packet_mm_open(struct vm_area_struct *vma) struct sock *sk = sock->sk; if (sk) - atomic_inc(&pkt_sk(sk)->mapped); + atomic_long_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) @@ -4224,7 +4224,7 @@ static void packet_mm_close(struct vm_area_struct *vma) struct sock *sk = sock->sk; if (sk) - atomic_dec(&pkt_sk(sk)->mapped); + atomic_long_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { @@ -4319,7 +4319,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = -EBUSY; if (!closing) { - if (atomic_read(&po->mapped)) + if (atomic_long_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; @@ -4422,7 +4422,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = -EBUSY; mutex_lock(&po->pg_vec_lock); - if (closing || atomic_read(&po->mapped) == 0) { + if (closing || atomic_long_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); @@ -4440,9 +4440,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, po->prot_hook.func = (po->rx_ring.pg_vec) ? 
tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); - if (atomic_read(&po->mapped)) - pr_err("packet_mmap: vma is busy: %d\n", - atomic_read(&po->mapped)); + if (atomic_long_read(&po->mapped)) + pr_err("packet_mmap: vma is busy: %ld\n", + atomic_long_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); @@ -4520,7 +4520,7 @@ static int packet_mmap(struct file *file, struct socket *sock, } } - atomic_inc(&po->mapped); + atomic_long_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; diff --git a/net/packet/internal.h b/net/packet/internal.h index 3d871cae85b8..7f2d5eed5e00 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -125,7 +125,7 @@ struct packet_sock { __be16 num; struct packet_rollover *rollover; struct packet_mclist *mclist; - atomic_t mapped; + atomic_long_t mapped; enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; From 5eaa4d20f633b81ee57fc2f45068abec47d75500 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 5 Dec 2023 17:59:47 +0900 Subject: [PATCH 31/54] nilfs2: prevent WARNING in nilfs_sufile_set_segment_usage() commit 675abf8df1353e0e3bde314993e0796c524cfbf0 upstream. If nilfs2 reads a disk image with corrupted segment usage metadata, and its segment usage information is marked as an error for the segment at the write location, nilfs_sufile_set_segment_usage() can trigger WARN_ONs during log writing. Segments newly allocated for writing with nilfs_sufile_alloc() will not have this error flag set, but this unexpected situation will occur if the segment indexed by either nilfs->ns_segnum or nilfs->ns_nextnum (active segment) was marked in error. Fix this issue by inserting a sanity check to treat it as a file system corruption. Since error returns are not allowed during the execution phase where nilfs_sufile_set_segment_usage() is used, this inserts the sanity check into nilfs_sufile_mark_dirty() which pre-reads the buffer containing the segment usage record to be updated and sets it up in a dirty state for writing. In addition, nilfs_sufile_set_segment_usage() is also called when canceling log writing and undoing segment usage update, so in order to avoid issuing the same kernel warning in that case, in case of cancellation, avoid checking the error flag in nilfs_sufile_set_segment_usage(). 
Link: https://lkml.kernel.org/r/20231205085947.4431-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+14e9f834f6ddecece094@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=14e9f834f6ddecece094 Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/sufile.c | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index d85d3c758d7b..4626540008c1 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -504,15 +504,38 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); - if (!ret) { - mark_buffer_dirty(bh); - nilfs_mdt_mark_dirty(sufile); - kaddr = kmap_atomic(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); - nilfs_segment_usage_set_dirty(su); + if (ret) + goto out_sem; + + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + if (unlikely(nilfs_segment_usage_error(su))) { + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + kunmap_atomic(kaddr); brelse(bh); + if (nilfs_segment_is_active(nilfs, segnum)) { + nilfs_error(sufile->i_sb, + "active segment %llu is erroneous", + (unsigned long long)segnum); + } else { + /* + * Segments marked erroneous are never allocated by + * nilfs_sufile_alloc(); only active segments, ie, + * the segments indexed by ns_segnum or ns_nextnum, + * can be erroneous here. + */ + WARN_ON_ONCE(1); + } + ret = -EIO; + } else { + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); + brelse(bh); } +out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } @@ -539,9 +562,14 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); - WARN_ON(nilfs_segment_usage_error(su)); - if (modtime) + if (modtime) { + /* + * Check segusage error and set su_lastmod only when updating + * this entry with a valid timestamp, not for cancellation. + */ + WARN_ON_ONCE(nilfs_segment_usage_error(su)); su->su_lastmod = cpu_to_le64(modtime); + } su->su_nblocks = cpu_to_le32(nblocks); kunmap_atomic(kaddr); From 78f5b3befdbc276488774350ba1aae47213a7d87 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Dec 2023 16:52:09 -0500 Subject: [PATCH 32/54] tracing: Always update snapshot buffer size commit 7be76461f302ec05cbd62b90b2a05c64299ca01f upstream. It use to be that only the top level instance had a snapshot buffer (for latency tracers like wakeup and irqsoff). The update of the ring buffer size would check if the instance was the top level and if so, it would also update the snapshot buffer as it needs to be the same as the main buffer. 
Now that lower level instances also has a snapshot buffer, they too need to update their snapshot buffer sizes when the main buffer is changed, otherwise the following can be triggered: # cd /sys/kernel/tracing # echo 1500 > buffer_size_kb # mkdir instances/foo # echo irqsoff > instances/foo/current_tracer # echo 1000 > instances/foo/buffer_size_kb Produces: WARNING: CPU: 2 PID: 856 at kernel/trace/trace.c:1938 update_max_tr_single.part.0+0x27d/0x320 Which is: ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { [..] } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); <== here That's because ring_buffer_swap_cpu() has: int ret = -EINVAL; [..] /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; [..] out: return ret; } Instead, update all instances' snapshot buffer sizes when their main buffer size is updated. Link: https://lkml.kernel.org/r/20231205220010.454662151@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Fixes: 6d9b3fa5e7f6 ("tracing: Move tracing_max_latency into trace_array") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a6d84349bcc4..04f4ee81576d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5258,8 +5258,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, return ret; #ifdef CONFIG_TRACER_MAX_TRACE - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || - !tr->current_trace->use_max_tr) + if (!tr->current_trace->use_max_tr) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); From 74c00b703eda8cd185e4272ec043956681c8aa9b Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 5 Dec 2023 17:17:34 +0100 Subject: [PATCH 33/54] tracing: Fix incomplete locking when disabling buffered events commit 7fed14f7ac9cf5e38c693836fe4a874720141845 upstream. The following warning appears when using buffered events: [ 203.556451] WARNING: CPU: 53 PID: 10220 at kernel/trace/ring_buffer.c:3912 ring_buffer_discard_commit+0x2eb/0x420 [...] [ 203.670690] CPU: 53 PID: 10220 Comm: stress-ng-sysin Tainted: G E 6.7.0-rc2-default #4 56e6d0fcf5581e6e51eaaecbdaec2a2338c80f3a [ 203.670704] Hardware name: Intel Corp. 
GROVEPORT/GROVEPORT, BIOS GVPRCRB1.86B.0016.D04.1705030402 05/03/2017 [ 203.670709] RIP: 0010:ring_buffer_discard_commit+0x2eb/0x420 [ 203.735721] Code: 4c 8b 4a 50 48 8b 42 48 49 39 c1 0f 84 b3 00 00 00 49 83 e8 01 75 b1 48 8b 42 10 f0 ff 40 08 0f 0b e9 fc fe ff ff f0 ff 47 08 <0f> 0b e9 77 fd ff ff 48 8b 42 10 f0 ff 40 08 0f 0b e9 f5 fe ff ff [ 203.735734] RSP: 0018:ffffb4ae4f7b7d80 EFLAGS: 00010202 [ 203.735745] RAX: 0000000000000000 RBX: ffffb4ae4f7b7de0 RCX: ffff8ac10662c000 [ 203.735754] RDX: ffff8ac0c750be00 RSI: ffff8ac10662c000 RDI: ffff8ac0c004d400 [ 203.781832] RBP: ffff8ac0c039cea0 R08: 0000000000000000 R09: 0000000000000000 [ 203.781839] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 203.781842] R13: ffff8ac10662c000 R14: ffff8ac0c004d400 R15: ffff8ac10662c008 [ 203.781846] FS: 00007f4cd8a67740(0000) GS:ffff8ad798880000(0000) knlGS:0000000000000000 [ 203.781851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 203.781855] CR2: 0000559766a74028 CR3: 00000001804c4000 CR4: 00000000001506f0 [ 203.781862] Call Trace: [ 203.781870] [ 203.851949] trace_event_buffer_commit+0x1ea/0x250 [ 203.851967] trace_event_raw_event_sys_enter+0x83/0xe0 [ 203.851983] syscall_trace_enter.isra.0+0x182/0x1a0 [ 203.851990] do_syscall_64+0x3a/0xe0 [ 203.852075] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 203.852090] RIP: 0033:0x7f4cd870fa77 [ 203.982920] Code: 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 b8 89 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e9 43 0e 00 f7 d8 64 89 01 48 [ 203.982932] RSP: 002b:00007fff99717dd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000089 [ 203.982942] RAX: ffffffffffffffda RBX: 0000558ea1d7b6f0 RCX: 00007f4cd870fa77 [ 203.982948] RDX: 0000000000000000 RSI: 00007fff99717de0 RDI: 0000558ea1d7b6f0 [ 203.982957] RBP: 00007fff99717de0 R08: 00007fff997180e0 R09: 00007fff997180e0 [ 203.982962] R10: 00007fff997180e0 R11: 0000000000000246 R12: 00007fff99717f40 [ 204.049239] R13: 00007fff99718590 R14: 0000558e9f2127a8 R15: 00007fff997180b0 [ 204.049256] For instance, it can be triggered by running these two commands in parallel: $ while true; do echo hist:key=id.syscall:val=hitcount > \ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger; done $ stress-ng --sysinfo $(nproc) The warning indicates that the current ring_buffer_per_cpu is not in the committing state. It happens because the active ring_buffer_event doesn't actually come from the ring_buffer_per_cpu but is allocated from trace_buffered_event. The bug is in function trace_buffered_event_disable() where the following normally happens: * The code invokes disable_trace_buffered_event() via smp_call_function_many() and follows it by synchronize_rcu(). This increments the per-CPU variable trace_buffered_event_cnt on each target CPU and grants trace_buffered_event_disable() the exclusive access to the per-CPU variable trace_buffered_event. * Maintenance is performed on trace_buffered_event, all per-CPU event buffers get freed. * The code invokes enable_trace_buffered_event() via smp_call_function_many(). This decrements trace_buffered_event_cnt and releases the access to trace_buffered_event. A problem is that smp_call_function_many() runs a given function on all target CPUs except on the current one. The following can then occur: * Task X executing trace_buffered_event_disable() runs on CPU 0. * The control reaches synchronize_rcu() and the task gets rescheduled on another CPU 1. * The RCU synchronization finishes. 
At this point, trace_buffered_event_disable() has the exclusive access to all trace_buffered_event variables except trace_buffered_event[CPU0] because trace_buffered_event_cnt[CPU0] is never incremented and if the buffer is currently unused, remains set to 0. * A different task Y is scheduled on CPU 0 and hits a trace event. The code in trace_event_buffer_lock_reserve() sees that trace_buffered_event_cnt[CPU0] is set to 0 and decides the use the buffer provided by trace_buffered_event[CPU0]. * Task X continues its execution in trace_buffered_event_disable(). The code incorrectly frees the event buffer pointed by trace_buffered_event[CPU0] and resets the variable to NULL. * Task Y writes event data to the now freed buffer and later detects the created inconsistency. The issue is observable since commit dea499781a11 ("tracing: Fix warning in trace_buffered_event_disable()") which moved the call of trace_buffered_event_disable() in __ftrace_event_enable_disable() earlier, prior to invoking call->class->reg(.. TRACE_REG_UNREGISTER ..). The underlying problem in trace_buffered_event_disable() is however present since the original implementation in commit 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events"). Fix the problem by replacing the two smp_call_function_many() calls with on_each_cpu_mask() which invokes a given callback on all CPUs. Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ Link: https://lkml.kernel.org/r/20231205161736.19663-2-petr.pavlu@suse.com Cc: stable@vger.kernel.org Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") Fixes: dea499781a11 ("tracing: Fix warning in trace_buffered_event_disable()") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 04f4ee81576d..150087920399 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2261,11 +2261,9 @@ void trace_buffered_event_disable(void) if (--trace_buffered_event_ref) return; - preempt_disable(); /* For each CPU, set the buffer as used. */ - smp_call_function_many(tracing_buffer_mask, - disable_trace_buffered_event, NULL, 1); - preempt_enable(); + on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, + NULL, true); /* Wait for all current users to finish */ synchronize_sched(); @@ -2280,11 +2278,9 @@ void trace_buffered_event_disable(void) */ smp_wmb(); - preempt_disable(); /* Do the work on each cpu */ - smp_call_function_many(tracing_buffer_mask, - enable_trace_buffered_event, NULL, 1); - preempt_enable(); + on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, + true); } static struct ring_buffer *temp_buffer; From f718e2cb3915a5ec68d6cee020c2650c7b6fad5e Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 5 Dec 2023 17:17:36 +0100 Subject: [PATCH 34/54] tracing: Fix a possible race when disabling buffered events commit c0591b1cccf708a47bc465c62436d669a4213323 upstream. Function trace_buffered_event_disable() is responsible for freeing pages backing buffered events and this process can run concurrently with trace_event_buffer_lock_reserve(). The following race is currently possible: * Function trace_buffered_event_disable() is called on CPU 0. It increments trace_buffered_event_cnt on each CPU and waits via synchronize_rcu() for each user of trace_buffered_event to complete. 
* After synchronize_rcu() is finished, function trace_buffered_event_disable() has the exclusive access to trace_buffered_event. All counters trace_buffered_event_cnt are at 1 and all pointers trace_buffered_event are still valid. * At this point, on a different CPU 1, the execution reaches trace_event_buffer_lock_reserve(). The function calls preempt_disable_notrace() and only now enters an RCU read-side critical section. The function proceeds and reads a still valid pointer from trace_buffered_event[CPU1] into the local variable "entry". However, it doesn't yet read trace_buffered_event_cnt[CPU1] which happens later. * Function trace_buffered_event_disable() continues. It frees trace_buffered_event[CPU1] and decrements trace_buffered_event_cnt[CPU1] back to 0. * Function trace_event_buffer_lock_reserve() continues. It reads and increments trace_buffered_event_cnt[CPU1] from 0 to 1. This makes it believe that it can use the "entry" that it already obtained but the pointer is now invalid and any access results in a use-after-free. Fix the problem by making a second synchronize_rcu() call after all trace_buffered_event values are set to NULL. This waits on all potential users in trace_event_buffer_lock_reserve() that still read a previous pointer from trace_buffered_event. Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ Link: https://lkml.kernel.org/r/20231205161736.19663-4-petr.pavlu@suse.com Cc: stable@vger.kernel.org Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 150087920399..b43d681b072f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2272,13 +2272,17 @@ void trace_buffered_event_disable(void) free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } - /* - * Make sure trace_buffered_event is NULL before clearing - * trace_buffered_event_cnt. - */ - smp_wmb(); - /* Do the work on each cpu */ + /* + * Wait for all CPUs that potentially started checking if they can use + * their event buffer only after the previous synchronize_rcu() call and + * they still read a valid pointer from trace_buffered_event. It must be + * ensured they don't see cleared trace_buffered_event_cnt else they + * could wrongly decide to use the pointed-to buffer which is now freed. + */ + synchronize_rcu(); + + /* For each CPU, relinquish the buffer */ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, true); } From ece0857258cbaf20b9828157035999f46ca060c8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 16 Jun 2022 11:06:23 -0700 Subject: [PATCH 35/54] perf/core: Add a new read format to get a number of lost samples [ Upstream commit 119a784c81270eb88e573174ed2209225d646656 ] Sometimes we want to know an accurate number of samples even if it's lost. Currenlty PERF_RECORD_LOST is generated for a ring-buffer which might be shared with other events. So it's hard to know per-event lost count. Add event->lost_samples field and PERF_FORMAT_LOST to retrieve it from userspace. 
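A minimal userspace sketch of the new read layout for the !PERF_FORMAT_GROUP case; PERF_FORMAT_LOST is defined locally in case the installed uapi headers predate this change, and the event choice is only an example:

  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>
  #include <stdint.h>

  #ifndef PERF_FORMAT_LOST
  #define PERF_FORMAT_LOST (1U << 4)
  #endif

  int main(void)
  {
          struct perf_event_attr attr;
          /* value, id, lost -- matching PERF_FORMAT_ID | PERF_FORMAT_LOST */
          uint64_t buf[3];
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = PERF_TYPE_SOFTWARE;
          attr.config = PERF_COUNT_SW_CPU_CLOCK;
          attr.read_format = PERF_FORMAT_ID | PERF_FORMAT_LOST;

          fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }
          /* ... run the workload; with sampling plus an mmap'ed ring
           * buffer, "lost" counts samples dropped from that buffer ... */
          if (read(fd, buf, sizeof(buf)) == sizeof(buf))
                  printf("count=%llu id=%llu lost=%llu\n",
                         (unsigned long long)buf[0],
                         (unsigned long long)buf[1],
                         (unsigned long long)buf[2]);
          close(fd);
          return 0;
  }

With PERF_FORMAT_GROUP requested as well, the lost value is instead appended after each member's id, as the uapi layout comment in this patch describes.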
Original-patch-by: Jiri Olsa Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220616180623.1358843-1-namhyung@kernel.org Stable-dep-of: 382c27f4ed28 ("perf: Fix perf_event_validate_size()") Signed-off-by: Sasha Levin --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 5 ++++- kernel/events/core.c | 21 ++++++++++++++++++--- kernel/events/ring_buffer.c | 5 ++++- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d46eeddeb859..123e4d715c98 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -685,6 +685,8 @@ struct perf_event { struct pid_namespace *ns; u64 id; + atomic64_t lost_samples; + u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 5fb4cdf37100..140024b2db38 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -273,6 +273,7 @@ enum { * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } && !PERF_FORMAT_GROUP * * { u64 nr; @@ -280,6 +281,7 @@ enum { * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 value; * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } cntr[nr]; * } && PERF_FORMAT_GROUP * }; @@ -289,8 +291,9 @@ enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 3c8eb5d84214..acc6403fc87b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1725,6 +1725,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings) if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); @@ -4816,11 +4819,15 @@ static int __perf_read_group_add(struct perf_event *leader, values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); } unlock: @@ -4874,7 +4881,7 @@ static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); @@ -4884,6 +4891,8 @@ static int perf_read_one(struct perf_event *event, values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -6193,7 +6202,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 enabled, u64 
running) { u64 read_format = event->attr.read_format; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = perf_event_count(event); @@ -6207,6 +6216,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } @@ -6217,7 +6228,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; - u64 values[5]; + u64 values[6]; int n = 0; values[n++] = 1 + leader->nr_siblings; @@ -6235,6 +6246,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); @@ -6248,6 +6261,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 2f6f77658eba..ddcbd03eccbb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -165,8 +165,10 @@ __perf_output_begin(struct perf_output_handle *handle, goto out; if (unlikely(rb->paused)) { - if (rb->nr_pages) + if (rb->nr_pages) { local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); + } goto out; } @@ -249,6 +251,7 @@ __perf_output_begin(struct perf_output_handle *handle, fail: local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); perf_output_put_handle(handle); out: rcu_read_unlock(); From f5d6ab016792c9d6d5280fdb7f8962eb3b8c620e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Nov 2023 15:24:52 +0100 Subject: [PATCH 36/54] perf: Fix perf_event_validate_size() [ Upstream commit 382c27f4ed28f803b1f1473ac2d8db0afc795a1b ] Budimir noted that perf_event_validate_size() only checks the size of the newly added event, even though the sizes of all existing events can also change due to not all events having the same read_format. When we attach the new event, perf_group_attach(), we do re-compute the size for all events. 
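To put a number on "the one term that grows with group size": with PERF_FORMAT_GROUP plus both time fields, PERF_FORMAT_ID and PERF_FORMAT_LOST, the reworked __perf_event_read_size() below computes roughly

  read_size = 8 (nr) + 2 * 8 (times) + nr_events * 3 * 8 (value, id, lost)
            = 24 + 24 * nr_events

so the 16k cap corresponds to a group of about 680 events, and the check is now applied to every member's read_format rather than only to the event being added.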
Fixes: a723968c0ed3 ("perf: Fix u16 overflows") Reported-by: Budimir Markovic Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sasha Levin --- kernel/events/core.c | 61 +++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index acc6403fc87b..4182e265176d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1710,31 +1710,34 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -static void __perf_event_read_size(struct perf_event *event, int nr_siblings) +static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_ID) + if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_LOST) + if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_GROUP) { + if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } - size += entry * nr; - event->read_size = size; + /* + * Since perf_event_validate_size() limits this to 16k and inhibits + * adding more siblings, this will never overflow. + */ + return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) @@ -1775,8 +1778,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) */ static void perf_event__header_size(struct perf_event *event) { - __perf_event_read_size(event, - event->group_leader->nr_siblings); + event->read_size = + __perf_event_read_size(event->attr.read_format, + event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } @@ -1807,24 +1811,35 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +/* + * Check that adding an event to the group does not result in anybody + * overflowing the 64k event limit imposed by the output buffer. + * + * Specifically, check that the read_size for the event does not exceed 16k, + * read_size being the one term that grows with groups size. Since read_size + * depends on per-event read_format, also (re)check the existing events. + * + * This leaves 48k for the constant size fields and things like callchains, + * branch stacks and register sets. + */ static bool perf_event_validate_size(struct perf_event *event) { - /* - * The values computed here will be over-written when we actually - * attach the event. - */ - __perf_event_read_size(event, event->group_leader->nr_siblings + 1); - __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); - perf_event__id_header_size(event); + struct perf_event *sibling, *group_leader = event->group_leader; - /* - * Sum the lot; should not exceed the 64k limit we have on records. - * Conservative limit to allow for callchains and other variable fields. 
- */ - if (event->read_size + event->header_size + - event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) + if (__perf_event_read_size(event->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) return false; + if (__perf_event_read_size(group_leader->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + + for_each_sibling_event(sibling, group_leader) { + if (__perf_event_read_size(sibling->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + } + return true; } From ba0bc70d49c82803f2b793cf003d925e2424c570 Mon Sep 17 00:00:00 2001 From: Boerge Struempfel Date: Wed, 29 Nov 2023 16:23:07 +0100 Subject: [PATCH 37/54] gpiolib: sysfs: Fix error handling on failed export [ Upstream commit 95dd1e34ff5bbee93a28ff3947eceaf6de811b1a ] If gpio_set_transitory() fails, we should free the GPIO again. Most notably, the flag FLAG_REQUESTED has previously been set in gpiod_request_commit(), and should be reset on failure. To my knowledge, this does not affect any current users, since the gpio_set_transitory() mainly returns 0 and -ENOTSUPP, which is converted to 0. However the gpio_set_transitory() function calles the .set_config() function of the corresponding GPIO chip and there are some GPIO drivers in which some (unlikely) branches return other values like -EPROBE_DEFER, and -EINVAL. In these cases, the above mentioned FLAG_REQUESTED would not be reset, which results in the pin being blocked until the next reboot. Fixes: e10f72bf4b3e ("gpio: gpiolib: Generalise state persistence beyond sleep") Signed-off-by: Boerge Struempfel Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpiolib-sysfs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c index e0ccc79b239a..7a8dfc36a6c1 100644 --- a/drivers/gpio/gpiolib-sysfs.c +++ b/drivers/gpio/gpiolib-sysfs.c @@ -495,14 +495,17 @@ static ssize_t export_store(struct class *class, } status = gpiod_set_transitory(desc, false); - if (!status) { - status = gpiod_export(desc, true); - if (status < 0) - gpiod_free(desc); - else - set_bit(FLAG_SYSFS, &desc->flags); + if (status) { + gpiod_free(desc); + goto done; } + status = gpiod_export(desc, true); + if (status < 0) + gpiod_free(desc); + else + set_bit(FLAG_SYSFS, &desc->flags); + done: if (status) pr_debug("%s: status %d\n", __func__, status); From 7d357f053a9bb8d52b3b529029051dbbf9fef5a0 Mon Sep 17 00:00:00 2001 From: Konstantin Aladyshev Date: Wed, 6 Dec 2023 11:07:44 +0300 Subject: [PATCH 38/54] usb: gadget: f_hid: fix report descriptor allocation commit 61890dc28f7d9e9aac8a9471302613824c22fae4 upstream. The commit 89ff3dfac604 ("usb: gadget: f_hid: fix f_hidg lifetime vs cdev") has introduced a bug that leads to hid device corruption after the replug operation. Reverse device managed memory allocation for the report descriptor to fix the issue. Tested: This change was tested on the AMD EthanolX CRB server with the BMC based on the OpenBMC distribution. The BMC provides KVM functionality via the USB gadget device: - before: KVM page refresh results in a broken USB device, - after: KVM page refresh works without any issues. 
Fixes: 89ff3dfac604 ("usb: gadget: f_hid: fix f_hidg lifetime vs cdev") Cc: stable@vger.kernel.org Signed-off-by: Konstantin Aladyshev Link: https://lore.kernel.org/r/20231206080744.253-2-aladyshev22@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_hid.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index ea877cba0763..b18d44f04e37 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c @@ -88,6 +88,7 @@ static void hidg_release(struct device *dev) { struct f_hidg *hidg = container_of(dev, struct f_hidg, dev); + kfree(hidg->report_desc); kfree(hidg->set_report_buf); kfree(hidg); } @@ -1293,9 +1294,9 @@ static struct usb_function *hidg_alloc(struct usb_function_instance *fi) hidg->report_length = opts->report_length; hidg->report_desc_length = opts->report_desc_length; if (opts->report_desc) { - hidg->report_desc = devm_kmemdup(&hidg->dev, opts->report_desc, - opts->report_desc_length, - GFP_KERNEL); + hidg->report_desc = kmemdup(opts->report_desc, + opts->report_desc_length, + GFP_KERNEL); if (!hidg->report_desc) { put_device(&hidg->dev); --opts->refcnt; From cfa4fe60a9a1e5ac15e7219b6774a62d05fcc611 Mon Sep 17 00:00:00 2001 From: Cameron Williams Date: Thu, 2 Nov 2023 21:10:40 +0000 Subject: [PATCH 39/54] parport: Add support for Brainboxes IX/UC/PX parallel cards commit 1a031f6edc460e9562098bdedc3918da07c30a6e upstream. Adds support for Intashield IX-500/IX-550, UC-146/UC-157, PX-146/PX-157, PX-203 and PX-475 (LPT port) Cc: stable@vger.kernel.org Signed-off-by: Cameron Williams Acked-by: Sudip Mukherjee Link: https://lore.kernel.org/r/AS4PR02MB790389C130410BD864C8DCC9C4A6A@AS4PR02MB7903.eurprd02.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/parport/parport_pc.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c index d99ac73a1d89..c34ad5dd62e3 100644 --- a/drivers/parport/parport_pc.c +++ b/drivers/parport/parport_pc.c @@ -2647,6 +2647,8 @@ enum parport_pc_pci_cards { netmos_9865, quatech_sppxp100, wch_ch382l, + brainboxes_uc146, + brainboxes_px203, }; @@ -2710,6 +2712,8 @@ static struct parport_pc_pci { /* netmos_9865 */ { 1, { { 0, -1 }, } }, /* quatech_sppxp100 */ { 1, { { 0, 1 }, } }, /* wch_ch382l */ { 1, { { 2, -1 }, } }, + /* brainboxes_uc146 */ { 1, { { 3, -1 }, } }, + /* brainboxes_px203 */ { 1, { { 0, -1 }, } }, }; static const struct pci_device_id parport_pc_pci_tbl[] = { @@ -2801,6 +2805,23 @@ static const struct pci_device_id parport_pc_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, 0, 0, quatech_sppxp100 }, /* WCH CH382L PCI-E single parallel port card */ { 0x1c00, 0x3050, 0x1c00, 0x3050, 0, 0, wch_ch382l }, + /* Brainboxes IX-500/550 */ + { PCI_VENDOR_ID_INTASHIELD, 0x402a, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport }, + /* Brainboxes UC-146/UC-157 */ + { PCI_VENDOR_ID_INTASHIELD, 0x0be1, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc146 }, + { PCI_VENDOR_ID_INTASHIELD, 0x0be2, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_uc146 }, + /* Brainboxes PX-146/PX-257 */ + { PCI_VENDOR_ID_INTASHIELD, 0x401c, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport }, + /* Brainboxes PX-203 */ + { PCI_VENDOR_ID_INTASHIELD, 0x4007, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, brainboxes_px203 }, + /* Brainboxes PX-475 */ + { PCI_VENDOR_ID_INTASHIELD, 0x401f, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport }, { 0, } /* terminate list */ }; 
MODULE_DEVICE_TABLE(pci, parport_pc_pci_tbl); From 3ac138895835d927ca01d9c891b9e7b3f41cf46b Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Wed, 29 Nov 2023 19:23:50 +0000 Subject: [PATCH 40/54] usb: typec: class: fix typec_altmode_put_partner to put plugs commit b17b7fe6dd5c6ff74b38b0758ca799cdbb79e26e upstream. When typec_altmode_put_partner is called by a plug altmode upon release, the port altmode the plug belongs to will not remove its reference to the plug. The check to see if the altmode being released evaluates against the released altmode's partner instead of the calling altmode itself, so change adev in typec_altmode_put_partner to properly refer to the altmode being released. typec_altmode_set_partner is not run for port altmodes, so also add a check in typec_altmode_release to prevent typec_altmode_put_partner() calls on port altmode release. Fixes: 8a37d87d72f0 ("usb: typec: Bus type for alternate modes") Cc: stable@vger.kernel.org Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20231129192349.1773623-2-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 2ede329ce594..e789f4a7553a 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -192,7 +192,7 @@ static void typec_altmode_put_partner(struct altmode *altmode) if (!partner) return; - adev = &partner->adev; + adev = &altmode->adev; if (is_typec_plug(adev->dev.parent)) { struct typec_plug *plug = to_typec_plug(adev->dev.parent); @@ -459,7 +459,8 @@ static void typec_altmode_release(struct device *dev) { struct altmode *alt = to_altmode(to_typec_altmode(dev)); - typec_altmode_put_partner(alt); + if (!is_typec_port(dev->parent)) + typec_altmode_put_partner(alt); altmode_id_remove(alt->adev.dev.parent, alt->id); kfree(alt); From acc97466c09d5673445f6f5e655e090e4bfa53db Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 23 Nov 2023 08:28:18 +0100 Subject: [PATCH 41/54] serial: sc16is7xx: address RX timeout interrupt errata commit 08ce9a1b72e38cf44c300a44ac5858533eb3c860 upstream. This device has a silicon bug that makes it report a timeout interrupt but no data in the FIFO. The datasheet states the following in the errata section 18.1.4: "If the host reads the receive FIFO at the same time as a time-out interrupt condition happens, the host might read 0xCC (time-out) in the Interrupt Indication Register (IIR), but bit 0 of the Line Status Register (LSR) is not set (means there is no data in the receive FIFO)." The errata description seems to indicate it concerns only polled mode of operation when reading bit 0 of the LSR register. However, tests have shown and NXP has confirmed that the RXLVL register also yields 0 when the bug is triggered, and hence the IRQ driven implementation in this driver is equally affected. This bug has hit us on production units and when it does, sc16is7xx_irq() would spin forever because sc16is7xx_port_irq() keeps seeing an interrupt in the IIR register that is not cleared because the driver does not call into sc16is7xx_handle_rx() unless the RXLVL register reports at least one byte in the FIFO. Fix this by always reading one byte from the FIFO when this condition is detected in order to clear the interrupt. This approach was confirmed to be correct by NXP through their support channels. 
Tested by: Hugo Villeneuve Signed-off-by: Daniel Mack Co-Developed-by: Maxim Popov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20231123072818.1394539-1-daniel@zonque.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sc16is7xx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index d4496a44abdf..c65f9194abb0 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -694,6 +694,18 @@ static bool sc16is7xx_port_irq(struct sc16is7xx_port *s, int portno) case SC16IS7XX_IIR_RTOI_SRC: case SC16IS7XX_IIR_XOFFI_SRC: rxlen = sc16is7xx_port_read(port, SC16IS7XX_RXLVL_REG); + + /* + * There is a silicon bug that makes the chip report a + * time-out interrupt but no data in the FIFO. This is + * described in errata section 18.1.4. + * + * When this happens, read one byte from the FIFO to + * clear the interrupt. + */ + if (iir == SC16IS7XX_IIR_RTOI_SRC && !rxlen) + rxlen = 1; + if (rxlen) sc16is7xx_handle_rx(port, rxlen, iir); break; From 99e32a666a7a1810ae5feb691208a2271b1d8b92 Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Tue, 31 Oct 2023 14:12:42 +0100 Subject: [PATCH 42/54] serial: 8250_omap: Add earlycon support for the AM654 UART controller commit 8e42c301ce64e0dcca547626eb486877d502d336 upstream. Currently there is no support for earlycon on the AM654 UART controller. This commit adds it. Signed-off-by: Ronald Wahl Reviewed-by: Vignesh Raghavendra Link: https://lore.kernel.org/r/20231031131242.15516-1-rwahl@gmx.de Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_early.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index 5cd8c36c8fcc..2acaea5f3232 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -176,6 +176,7 @@ static int __init early_omap8250_setup(struct earlycon_device *device, OF_EARLYCON_DECLARE(omap8250, "ti,omap2-uart", early_omap8250_setup); OF_EARLYCON_DECLARE(omap8250, "ti,omap3-uart", early_omap8250_setup); OF_EARLYCON_DECLARE(omap8250, "ti,omap4-uart", early_omap8250_setup); +OF_EARLYCON_DECLARE(omap8250, "ti,am654-uart", early_omap8250_setup); #endif From 5ac3c7a8300e87ae0807e4c31d06b0a014eaa2d0 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 1 Dec 2023 19:37:27 +0100 Subject: [PATCH 43/54] x86/CPU/AMD: Check vendor in the AMD microcode callback commit 9b8493dc43044376716d789d07699f17d538a7c4 upstream. Commit in Fixes added an AMD-specific microcode callback. However, it didn't check the CPU vendor the kernel runs on explicitly. The only reason the Zenbleed check in it didn't run on other x86 vendors hardware was pure coincidental luck: if (!cpu_has_amd_erratum(c, amd_zenbleed)) return; gives true on other vendors because they don't have those families and models. However, with the removal of the cpu_has_amd_erratum() in 05f5f73936fa ("x86/CPU/AMD: Drop now unused CPU erratum checking function") that coincidental condition is gone, leading to the zenbleed check getting executed on other vendors too. Add the explicit vendor check for the whole callback as it should've been done in the first place. 
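As a userspace illustration of the vendor guard being added here (not kernel code, x86-only, and only an analogue of the check): CPUID leaf 0 returns the vendor string in EBX/EDX/ECX, and anything AMD-specific should bail out early unless it reads "AuthenticAMD" -- the same decision the callback now makes explicitly via boot_cpu_data.x86_vendor.

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                return 1;
        /* The vendor string is the concatenation of EBX, EDX, ECX. */
        memcpy(vendor + 0, &ebx, 4);
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';

        if (strcmp(vendor, "AuthenticAMD") != 0) {
                printf("%s: not AMD, skipping the AMD-only check\n", vendor);
                return 0;
        }
        printf("AuthenticAMD: vendor-specific microcode check may run\n");
        return 0;
}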
Fixes: 522b1d69219d ("x86/cpu/amd: Add a Zenbleed fix") Cc: Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231201184226.16749-1-bp@alien8.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/amd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dc41d4d7836e..84667781c41d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1253,5 +1253,8 @@ static void zenbleed_check_cpu(void *unused) void amd_check_microcode(void) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + on_each_cpu(zenbleed_check_cpu, NULL, 1); } From dfbacbe69bcca147809569eb5de8da4bc46e89ae Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 9 Nov 2023 13:36:24 +0100 Subject: [PATCH 44/54] KVM: s390/mm: Properly reset no-dat commit 27072b8e18a73ffeffb1c140939023915a35134b upstream. When the CMMA state needs to be reset, the no-dat bit also needs to be reset. Failure to do so could cause issues in the guest, since the guest expects the bit to be cleared after a reset. Cc: Reviewed-by: Nico Boehr Message-ID: <20231109123624.37314-1-imbrenda@linux.ibm.com> Signed-off-by: Claudio Imbrenda Signed-off-by: Greg Kroah-Hartman --- arch/s390/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9f3903089869..758e8e9eee4f 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -699,7 +699,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_clear(mm, addr, ptep); } if (reset) - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgste_set_unlock(ptep, pgste); preempt_enable(); } From 4ddf6f18350dbdfad807e32d5b0cc58aff3500d6 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 29 Nov 2023 23:15:47 +0900 Subject: [PATCH 45/54] nilfs2: fix missing error check for sb_set_blocksize call commit d61d0ab573649789bf9eb909c89a1a193b2e3d10 upstream. When mounting a filesystem image with a block size larger than the page size, nilfs2 repeatedly outputs long error messages with stack traces to the kernel log, such as the following: getblk(): invalid block size 8192 requested logical block size: 512 ... Call Trace: dump_stack_lvl+0x92/0xd4 dump_stack+0xd/0x10 bdev_getblk+0x33a/0x354 __breadahead+0x11/0x80 nilfs_search_super_root+0xe2/0x704 [nilfs2] load_nilfs+0x72/0x504 [nilfs2] nilfs_mount+0x30f/0x518 [nilfs2] legacy_get_tree+0x1b/0x40 vfs_get_tree+0x18/0xc4 path_mount+0x786/0xa88 __ia32_sys_mount+0x147/0x1a8 __do_fast_syscall_32+0x56/0xc8 do_fast_syscall_32+0x29/0x58 do_SYSENTER_32+0x15/0x18 entry_SYSENTER_32+0x98/0xf1 ... This overloads the system logger. And to make matters worse, it sometimes crashes the kernel with a memory access violation. This is because the return value of the sb_set_blocksize() call, which should be checked for errors, is not checked. The latter issue is due to out-of-buffer memory being accessed based on a large block size that caused sb_set_blocksize() to fail for buffers read with the initial minimum block size that remained unupdated in the super_block structure. Since nilfs2 mkfs tool does not accept block sizes larger than the system page size, this has been overlooked. However, it is possible to create this situation by intentionally modifying the tool or by passing a filesystem image created on a system with a large page size to a system with a smaller page size and mounting it. 
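A minimal sketch (assuming the usual block-layer rule that a usable block size is a power of two between 512 bytes and the page size) of why the unchecked call fails in this scenario: an 8 KiB filesystem block simply cannot be set on a machine with 4 KiB pages, so the mount path has to notice the failure instead of carrying on with the stale block size.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Usable block sizes are powers of two in [512, page size]; anything
 * else makes sb_set_blocksize() fail, which the caller must check. */
static bool blocksize_usable(long size, long page_size)
{
        return size >= 512 && size <= page_size &&
               (size & (size - 1)) == 0;
}

int main(void)
{
        long page_size = sysconf(_SC_PAGESIZE);
        long sizes[] = { 512, 1024, 4096, 8192, 65536 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("block size %5ld: %s\n", sizes[i],
                       blocksize_usable(sizes[i], page_size) ?
                       "ok" : "rejected, mount must bail out");
        return 0;
}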
Fix this issue by inserting the expected error handling for the call to sb_set_blocksize(). Link: https://lkml.kernel.org/r/20231129141547.4726-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/the_nilfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d550a564645e..c8d869bc25b0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -688,7 +688,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) goto failed_sbh; } nilfs_release_super_block(nilfs); - sb_set_blocksize(sb, blocksize); + if (!sb_set_blocksize(sb, blocksize)) { + nilfs_msg(sb, KERN_ERR, "bad blocksize %d", blocksize); + err = -EINVAL; + goto out; + } err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) From 7c62ae9b2266b5edd41fc37c322a3185ea69674e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 11 Dec 2023 14:42:19 +0200 Subject: [PATCH 46/54] netlink: don't call ->netlink_bind with table lock held From: Florian Westphal commit f2764bd4f6a8dffaec3e220728385d9756b3c2cb upstream. When I added support to allow generic netlink multicast groups to be restricted to subscribers with CAP_NET_ADMIN I was unaware that a genl_bind implementation already existed in the past. It was reverted due to ABBA deadlock: 1. ->netlink_bind gets called with the table lock held. 2. genetlink bind callback is invoked, it grabs the genl lock. But when a new genl subsystem is (un)registered, these two locks are taken in reverse order. One solution would be to revert again and add a comment in genl referring 1e82a62fec613, "genetlink: remove genl_bind"). This would need a second change in mptcp to not expose the raw token value anymore, e.g. by hashing the token with a secret key so userspace can still associate subflow events with the correct mptcp connection. However, Paolo Abeni reminded me to double-check why the netlink table is locked in the first place. I can't find one. netlink_bind() is already called without this lock when userspace joins a group via NETLINK_ADD_MEMBERSHIP setsockopt. Same holds for the netlink_unbind operation. Digging through the history, commit f773608026ee1 ("netlink: access nlk groups safely in netlink bind and getname") expanded the lock scope. commit 3a20773beeeeade ("net: netlink: cap max groups which will be considered in netlink_bind()") ... removed the nlk->ngroups access that the lock scope extension was all about. Reduce the lock scope again and always call ->netlink_bind without the table lock. The Fixes tag should be vs. the patch mentioned in the link below, but that one got squash-merged into the patch that came earlier in the series. Fixes: 4d54cc32112d8d ("mptcp: avoid lock_fast usage in accept path") Link: https://lore.kernel.org/mptcp/20210213000001.379332-8-mathew.j.martineau@linux.intel.com/T/#u Cc: Cong Wang Cc: Xin Long Cc: Johannes Berg Cc: Sean Tranchetti Cc: Paolo Abeni Cc: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller Signed-off-by: Ido Schimmel Signed-off-by: Greg Kroah-Hartman --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 35ecaa93f213..e91489b3274c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1030,7 +1030,6 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; } - netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; @@ -1042,13 +1041,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (!err) continue; netlink_undo_bind(group, groups, sk); - goto unlock; + return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ + netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : From 30cc13fe89401588a52a0aa148dceff8f3796486 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 11 Dec 2023 14:42:20 +0200 Subject: [PATCH 47/54] genetlink: add CAP_NET_ADMIN test for multicast bind This is a partial backport of upstream commit 4d54cc32112d ("mptcp: avoid lock_fast usage in accept path"). It is only a partial backport because the patch in the link below was erroneously squash-merged into upstream commit 4d54cc32112d ("mptcp: avoid lock_fast usage in accept path"). Below is the original patch description from Florian Westphal: " genetlink sets NL_CFG_F_NONROOT_RECV for its netlink socket so anyone can subscribe to multicast messages. rtnetlink doesn't allow this unconditionally, rtnetlink_bind() restricts bind requests to CAP_NET_ADMIN for a few groups. This allows to set GENL_UNS_ADMIN_PERM flag on genl mcast groups to mandate CAP_NET_ADMIN. This will be used by the upcoming mptcp netlink event facility which exposes the token (mptcp connection identifier) to userspace. 
" Link: https://lore.kernel.org/mptcp/20210213000001.379332-8-mathew.j.martineau@linux.intel.com/ Signed-off-by: Ido Schimmel Signed-off-by: Greg Kroah-Hartman --- include/net/genetlink.h | 1 + net/netlink/genetlink.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 3e3a1a38884a..5a1539aa963d 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -14,6 +14,7 @@ */ struct genl_multicast_group { char name[GENL_NAMSIZ]; + u8 flags; }; struct genl_ops; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index ede73ecfb1f5..2f4c2b35a386 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -961,11 +961,43 @@ static struct genl_family genl_ctrl __ro_after_init = { .netnsok = true, }; +static int genl_bind(struct net *net, int group) +{ + const struct genl_family *family; + unsigned int id; + int ret = 0; + + genl_lock_all(); + + idr_for_each_entry(&genl_fam_idr, family, id) { + const struct genl_multicast_group *grp; + int i; + + if (family->n_mcgrps == 0) + continue; + + i = group - family->mcgrp_offset; + if (i < 0 || i >= family->n_mcgrps) + continue; + + grp = &family->mcgrps[i]; + if ((grp->flags & GENL_UNS_ADMIN_PERM) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) + ret = -EPERM; + + break; + } + + genl_unlock_all(); + return ret; +} + static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, + .bind = genl_bind, }; /* we'll bump the group number right afterwards */ From ac38a8b34cd07ebe8b16c766f0f302129b5a979d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 11 Dec 2023 14:42:21 +0200 Subject: [PATCH 48/54] psample: Require 'CAP_NET_ADMIN' when joining "packets" group commit 44ec98ea5ea9cfecd31a5c4cc124703cb5442832 upstream. The "psample" generic netlink family notifies sampled packets over the "packets" multicast group. This is problematic since by default generic netlink allows non-root users to listen to these notifications. Fix by marking the group with the 'GENL_UNS_ADMIN_PERM' flag. This will prevent non-root users or root without the 'CAP_NET_ADMIN' capability (in the user namespace owning the network namespace) from joining the group. Tested using [1]. 
Before: # capsh -- -c ./psample_repo # capsh --drop=cap_net_admin -- -c ./psample_repo After: # capsh -- -c ./psample_repo # capsh --drop=cap_net_admin -- -c ./psample_repo Failed to join "packets" multicast group [1] $ cat psample.c #include #include #include #include int join_grp(struct nl_sock *sk, const char *grp_name) { int grp, err; grp = genl_ctrl_resolve_grp(sk, "psample", grp_name); if (grp < 0) { fprintf(stderr, "Failed to resolve \"%s\" multicast group\n", grp_name); return grp; } err = nl_socket_add_memberships(sk, grp, NFNLGRP_NONE); if (err) { fprintf(stderr, "Failed to join \"%s\" multicast group\n", grp_name); return err; } return 0; } int main(int argc, char **argv) { struct nl_sock *sk; int err; sk = nl_socket_alloc(); if (!sk) { fprintf(stderr, "Failed to allocate socket\n"); return -1; } err = genl_connect(sk); if (err) { fprintf(stderr, "Failed to connect socket\n"); return err; } err = join_grp(sk, "config"); if (err) return err; err = join_grp(sk, "packets"); if (err) return err; return 0; } $ gcc -I/usr/include/libnl3 -lnl-3 -lnl-genl-3 -o psample_repo psample.c Fixes: 6ae0a6286171 ("net: Introduce psample, a new genetlink channel for packet sampling") Reported-by: "The UK's National Cyber Security Centre (NCSC)" Signed-off-by: Ido Schimmel Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231206213102.1824398-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/psample/psample.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/psample/psample.c b/net/psample/psample.c index 30e8239bd774..196fbf674dc1 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -31,7 +31,8 @@ enum psample_nl_multicast_groups { static const struct genl_multicast_group psample_nl_mcgrps[] = { [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, - [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME }, + [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME, + .flags = GENL_UNS_ADMIN_PERM }, }; static struct genl_family psample_nl_family __ro_after_init; From 855a2b559df16b0cf7b5dcc1a31042347afcabf2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 11 Dec 2023 14:42:22 +0200 Subject: [PATCH 49/54] drop_monitor: Require 'CAP_SYS_ADMIN' when joining "events" group commit e03781879a0d524ce3126678d50a80484a513c4b upstream. The "NET_DM" generic netlink family notifies drop locations over the "events" multicast group. This is problematic since by default generic netlink allows non-root users to listen to these notifications. Fix by adding a new field to the generic netlink multicast group structure that when set prevents non-root users or root without the 'CAP_SYS_ADMIN' capability (in the user namespace owning the network namespace) from joining the group. Set this field for the "events" group. Use 'CAP_SYS_ADMIN' rather than 'CAP_NET_ADMIN' because of the nature of the information that is shared over this group. Note that the capability check in this case will always be performed against the initial user namespace since the family is not netns aware and only operates in the initial network namespace. A new field is added to the structure rather than using the "flags" field because the existing field uses uAPI flags and it is inappropriate to add a new uAPI flag for an internal kernel check. In net-next we can rework the "flags" field to use internal flags and fold the new field into it. 
But for now, in order to reduce the amount of changes, add a new field. Since the information can only be consumed by root, mark the control plane operations that start and stop the tracing as root-only using the 'GENL_ADMIN_PERM' flag. Tested using [1]. Before: # capsh -- -c ./dm_repo # capsh --drop=cap_sys_admin -- -c ./dm_repo After: # capsh -- -c ./dm_repo # capsh --drop=cap_sys_admin -- -c ./dm_repo Failed to join "events" multicast group [1] $ cat dm.c #include #include #include #include int main(int argc, char **argv) { struct nl_sock *sk; int grp, err; sk = nl_socket_alloc(); if (!sk) { fprintf(stderr, "Failed to allocate socket\n"); return -1; } err = genl_connect(sk); if (err) { fprintf(stderr, "Failed to connect socket\n"); return err; } grp = genl_ctrl_resolve_grp(sk, "NET_DM", "events"); if (grp < 0) { fprintf(stderr, "Failed to resolve \"events\" multicast group\n"); return grp; } err = nl_socket_add_memberships(sk, grp, NFNLGRP_NONE); if (err) { fprintf(stderr, "Failed to join \"events\" multicast group\n"); return err; } return 0; } $ gcc -I/usr/include/libnl3 -lnl-3 -lnl-genl-3 -o dm_repo dm.c Fixes: 9a8afc8d3962 ("Network Drop Monitor: Adding drop monitor implementation & Netlink protocol") Reported-by: "The UK's National Cyber Security Centre (NCSC)" Signed-off-by: Ido Schimmel Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231206213102.1824398-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/net/genetlink.h | 2 ++ net/core/drop_monitor.c | 4 +++- net/netlink/genetlink.c | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 5a1539aa963d..b8b7edcfc893 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -11,10 +11,12 @@ /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family + * @cap_sys_admin: whether %CAP_SYS_ADMIN is required for binding */ struct genl_multicast_group { char name[GENL_NAMSIZ]; u8 flags; + u8 cap_sys_admin:1; }; struct genl_ops; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2ed600012640..52e559252a9e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -122,7 +122,7 @@ out: } static const struct genl_multicast_group dropmon_mcgrps[] = { - { .name = "events", }, + { .name = "events", .cap_sys_admin = 1 }, }; static void send_dm_alert(struct work_struct *work) @@ -370,10 +370,12 @@ static const struct genl_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_START, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_STOP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2f4c2b35a386..f449be93b375 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -984,6 +984,9 @@ static int genl_bind(struct net *net, int group) if ((grp->flags & GENL_UNS_ADMIN_PERM) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) ret = -EPERM; + if (grp->cap_sys_admin && + !ns_capable(net->user_ns, CAP_SYS_ADMIN)) + ret = -EPERM; break; } From 5cff0311825c90e4ead805f338cd754c72e07fa0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Aug 2022 17:36:41 -0700 Subject: [PATCH 50/54] tools headers UAPI: Sync linux/perf_event.h with the kernel sources commit 65ba872a6971c11ceb342c3330f059289c0e6bdb upstream. 
To pick the trivial change in: 119a784c81270eb8 ("perf/core: Add a new read format to get a number of lost samples") Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220819003644.508916-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/include/uapi/linux/perf_event.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index a45e7b4f0316..3e3c303bdda5 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -273,6 +273,7 @@ enum { * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } && !PERF_FORMAT_GROUP * * { u64 nr; @@ -280,6 +281,7 @@ enum { * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 value; * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } cntr[nr]; * } && PERF_FORMAT_GROUP * }; @@ -289,8 +291,9 @@ enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ From 2e0dfb566559da99fa5e81bc1caafe45952abff8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 4 Sep 2020 12:50:39 -0700 Subject: [PATCH 51/54] IB/isert: Fix unaligned immediate-data handling commit 0b089c1ef7047652b13b4cdfdb1e0e7dbdb8c9ab upstream. Currently we allocate rx buffers in a single contiguous buffers for headers (iser and iscsi) and data trailer. This means that most likely the data starting offset is aligned to 76 bytes (size of both headers). This worked fine for years, but at some point this broke, resulting in data corruptions in isert when a command comes with immediate data and the underlying backend device assumes 512 bytes buffer alignment. We assume a hard-requirement for all direct I/O buffers to be 512 bytes aligned. To fix this, we should avoid passing unaligned buffers for I/O. Instead, we allocate our recv buffers with some extra space such that we can have the data portion align to 512 byte boundary. This also means that we cannot reference headers or data using structure but rather accessors (as they may move based on alignment). Also, get rid of the wrong __packed annotation from iser_rx_desc as this has only harmful effects (not aligned to anything). This affects the rx descriptors for iscsi login and data plane. 
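The key trick in this patch is to carve the headers and the data portion out of one oversized receive buffer so that the data lands on a 512-byte boundary, with the headers packed immediately in front of it. Below is a standalone userspace sketch of that arithmetic; 76 is the combined header size quoted above, and the buffer sizes are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ALIGN_UP(p, a)  (((uintptr_t)(p) + ((a) - 1)) & ~((uintptr_t)(a) - 1))
#define HDRS_LEN        76              /* iSER + iSCSI headers, as above */

struct rx_desc {
        char buf[8192 + 1024];          /* payload plus slack for alignment */
};

/* Align the *data* start to 512 bytes, then step back over the headers,
 * mirroring isert_get_iser_hdr()/isert_get_data() in the patch. */
static char *get_hdr(struct rx_desc *d)
{
        return (char *)ALIGN_UP(d->buf + HDRS_LEN, 512) - HDRS_LEN;
}

static char *get_data(struct rx_desc *d)
{
        return get_hdr(d) + HDRS_LEN;
}

int main(void)
{
        struct rx_desc *d = malloc(sizeof(*d));

        if (!d)
                return 1;
        printf("buf=%p hdr=%p data=%p data%%512=%lu\n",
               (void *)d->buf, (void *)get_hdr(d), (void *)get_data(d),
               (unsigned long)((uintptr_t)get_data(d) % 512));
        free(d);
        return 0;
}

Because the headers now float inside the buffer, the patch replaces direct struct-member access with the isert_get_*() accessors and maps/syncs the full ISER_RX_SIZE rather than a fixed layout.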
Fixes: 3d75ca0adef4 ("block: introduce multi-page bvec helpers") Link: https://lore.kernel.org/r/20200904195039.31687-1-sagi@grimberg.me Reported-by: Stephen Rust Tested-by: Doug Dumitru Signed-off-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/isert/ib_isert.c | 93 +++++++++++++------------ drivers/infiniband/ulp/isert/ib_isert.h | 41 ++++++++--- 2 files changed, 78 insertions(+), 56 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 60594dad5545..d7b7b77e4d65 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -190,15 +190,15 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn) rx_desc = isert_conn->rx_descs; for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { - dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + dma_addr = ib_dma_map_single(ib_dev, rx_desc->buf, + ISER_RX_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ib_dev, dma_addr)) goto dma_map_fail; rx_desc->dma_addr = dma_addr; rx_sg = &rx_desc->rx_sg; - rx_sg->addr = rx_desc->dma_addr; + rx_sg->addr = rx_desc->dma_addr + isert_get_hdr_offset(rx_desc); rx_sg->length = ISER_RX_PAYLOAD_SIZE; rx_sg->lkey = device->pd->local_dma_lkey; rx_desc->rx_cqe.done = isert_recv_done; @@ -210,7 +210,7 @@ dma_map_fail: rx_desc = isert_conn->rx_descs; for (j = 0; j < i; j++, rx_desc++) { ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ISER_RX_SIZE, DMA_FROM_DEVICE); } kfree(isert_conn->rx_descs); isert_conn->rx_descs = NULL; @@ -231,7 +231,7 @@ isert_free_rx_descriptors(struct isert_conn *isert_conn) rx_desc = isert_conn->rx_descs; for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ISER_RX_SIZE, DMA_FROM_DEVICE); } kfree(isert_conn->rx_descs); @@ -416,10 +416,9 @@ isert_free_login_buf(struct isert_conn *isert_conn) ISER_RX_PAYLOAD_SIZE, DMA_TO_DEVICE); kfree(isert_conn->login_rsp_buf); - ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, - ISER_RX_PAYLOAD_SIZE, - DMA_FROM_DEVICE); - kfree(isert_conn->login_req_buf); + ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); + kfree(isert_conn->login_desc); } static int @@ -428,25 +427,25 @@ isert_alloc_login_buf(struct isert_conn *isert_conn, { int ret; - isert_conn->login_req_buf = kzalloc(sizeof(*isert_conn->login_req_buf), + isert_conn->login_desc = kzalloc(sizeof(*isert_conn->login_desc), GFP_KERNEL); - if (!isert_conn->login_req_buf) + if (!isert_conn->login_desc) return -ENOMEM; - isert_conn->login_req_dma = ib_dma_map_single(ib_dev, - isert_conn->login_req_buf, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma); + isert_conn->login_desc->dma_addr = ib_dma_map_single(ib_dev, + isert_conn->login_desc->buf, + ISER_RX_SIZE, DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_desc->dma_addr); if (ret) { - isert_err("login_req_dma mapping error: %d\n", ret); - isert_conn->login_req_dma = 0; - goto out_free_login_req_buf; + isert_err("login_desc dma mapping error: %d\n", ret); + isert_conn->login_desc->dma_addr = 0; + goto out_free_login_desc; } isert_conn->login_rsp_buf = kzalloc(ISER_RX_PAYLOAD_SIZE, GFP_KERNEL); if (!isert_conn->login_rsp_buf) { ret = -ENOMEM; - goto out_unmap_login_req_buf; + goto 
out_unmap_login_desc; } isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev, @@ -463,11 +462,11 @@ isert_alloc_login_buf(struct isert_conn *isert_conn, out_free_login_rsp_buf: kfree(isert_conn->login_rsp_buf); -out_unmap_login_req_buf: - ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); -out_free_login_req_buf: - kfree(isert_conn->login_req_buf); +out_unmap_login_desc: + ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); +out_free_login_desc: + kfree(isert_conn->login_desc); return ret; } @@ -586,7 +585,7 @@ isert_connect_release(struct isert_conn *isert_conn) ib_destroy_qp(isert_conn->qp); } - if (isert_conn->login_req_buf) + if (isert_conn->login_desc) isert_free_login_buf(isert_conn); isert_device_put(device); @@ -976,17 +975,18 @@ isert_login_post_recv(struct isert_conn *isert_conn) int ret; memset(&sge, 0, sizeof(struct ib_sge)); - sge.addr = isert_conn->login_req_dma; + sge.addr = isert_conn->login_desc->dma_addr + + isert_get_hdr_offset(isert_conn->login_desc); sge.length = ISER_RX_PAYLOAD_SIZE; sge.lkey = isert_conn->device->pd->local_dma_lkey; isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n", sge.addr, sge.length, sge.lkey); - isert_conn->login_req_buf->rx_cqe.done = isert_login_recv_done; + isert_conn->login_desc->rx_cqe.done = isert_login_recv_done; memset(&rx_wr, 0, sizeof(struct ib_recv_wr)); - rx_wr.wr_cqe = &isert_conn->login_req_buf->rx_cqe; + rx_wr.wr_cqe = &isert_conn->login_desc->rx_cqe; rx_wr.sg_list = &sge; rx_wr.num_sge = 1; @@ -1063,7 +1063,7 @@ post_send: static void isert_rx_login_req(struct isert_conn *isert_conn) { - struct iser_rx_desc *rx_desc = isert_conn->login_req_buf; + struct iser_rx_desc *rx_desc = isert_conn->login_desc; int rx_buflen = isert_conn->login_req_len; struct iscsi_conn *conn = isert_conn->conn; struct iscsi_login *login = conn->conn_login; @@ -1075,7 +1075,7 @@ isert_rx_login_req(struct isert_conn *isert_conn) if (login->first_request) { struct iscsi_login_req *login_req = - (struct iscsi_login_req *)&rx_desc->iscsi_header; + (struct iscsi_login_req *)isert_get_iscsi_hdr(rx_desc); /* * Setup the initial iscsi_login values from the leading * login request PDU. 
@@ -1094,13 +1094,13 @@ isert_rx_login_req(struct isert_conn *isert_conn) login->tsih = be16_to_cpu(login_req->tsih); } - memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN); + memcpy(&login->req[0], isert_get_iscsi_hdr(rx_desc), ISCSI_HDR_LEN); size = min(rx_buflen, MAX_KEY_VALUE_PAIRS); isert_dbg("Using login payload size: %d, rx_buflen: %d " "MAX_KEY_VALUE_PAIRS: %d\n", size, rx_buflen, MAX_KEY_VALUE_PAIRS); - memcpy(login->req_buf, &rx_desc->data[0], size); + memcpy(login->req_buf, isert_get_data(rx_desc), size); if (login->first_request) { complete(&isert_conn->login_comp); @@ -1165,14 +1165,15 @@ isert_handle_scsi_cmd(struct isert_conn *isert_conn, if (imm_data_len != data_len) { sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); sg_copy_from_buffer(cmd->se_cmd.t_data_sg, sg_nents, - &rx_desc->data[0], imm_data_len); + isert_get_data(rx_desc), imm_data_len); isert_dbg("Copy Immediate sg_nents: %u imm_data_len: %d\n", sg_nents, imm_data_len); } else { sg_init_table(&isert_cmd->sg, 1); cmd->se_cmd.t_data_sg = &isert_cmd->sg; cmd->se_cmd.t_data_nents = 1; - sg_set_buf(&isert_cmd->sg, &rx_desc->data[0], imm_data_len); + sg_set_buf(&isert_cmd->sg, isert_get_data(rx_desc), + imm_data_len); isert_dbg("Transfer Immediate imm_data_len: %d\n", imm_data_len); } @@ -1241,9 +1242,9 @@ isert_handle_iscsi_dataout(struct isert_conn *isert_conn, } isert_dbg("Copying DataOut: sg_start: %p, sg_off: %u " "sg_nents: %u from %p %u\n", sg_start, sg_off, - sg_nents, &rx_desc->data[0], unsol_data_len); + sg_nents, isert_get_data(rx_desc), unsol_data_len); - sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0], + sg_copy_from_buffer(sg_start, sg_nents, isert_get_data(rx_desc), unsol_data_len); rc = iscsit_check_dataout_payload(cmd, hdr, false); @@ -1302,7 +1303,7 @@ isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd } cmd->text_in_ptr = text_in; - memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length); + memcpy(cmd->text_in_ptr, isert_get_data(rx_desc), payload_length); return iscsit_process_text_cmd(conn, cmd, hdr); } @@ -1312,7 +1313,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, uint32_t read_stag, uint64_t read_va, uint32_t write_stag, uint64_t write_va) { - struct iscsi_hdr *hdr = &rx_desc->iscsi_header; + struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc); struct iscsi_conn *conn = isert_conn->conn; struct iscsi_cmd *cmd; struct isert_cmd *isert_cmd; @@ -1410,8 +1411,8 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc) struct isert_conn *isert_conn = wc->qp->qp_context; struct ib_device *ib_dev = isert_conn->cm_id->device; struct iser_rx_desc *rx_desc = cqe_to_rx_desc(wc->wr_cqe); - struct iscsi_hdr *hdr = &rx_desc->iscsi_header; - struct iser_ctrl *iser_ctrl = &rx_desc->iser_header; + struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc); + struct iser_ctrl *iser_ctrl = isert_get_iser_hdr(rx_desc); uint64_t read_va = 0, write_va = 0; uint32_t read_stag = 0, write_stag = 0; @@ -1425,7 +1426,7 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc) rx_desc->in_use = true; ib_dma_sync_single_for_cpu(ib_dev, rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ISER_RX_SIZE, DMA_FROM_DEVICE); isert_dbg("DMA: 0x%llx, iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n", rx_desc->dma_addr, hdr->opcode, hdr->itt, hdr->flags, @@ -1460,7 +1461,7 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc) read_stag, read_va, write_stag, write_va); ib_dma_sync_single_for_device(ib_dev, 
rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ISER_RX_SIZE, DMA_FROM_DEVICE); } static void @@ -1474,8 +1475,8 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } - ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_req_dma, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); isert_conn->login_req_len = wc->byte_len - ISER_HEADERS_LEN; @@ -1490,8 +1491,8 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) complete(&isert_conn->login_req_comp); mutex_unlock(&isert_conn->mutex); - ib_dma_sync_single_for_device(ib_dev, isert_conn->login_req_dma, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); } static void diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 3b296bac4f60..d267a6d60d87 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -59,9 +59,11 @@ ISERT_MAX_TX_MISC_PDUS + \ ISERT_MAX_RX_MISC_PDUS) -#define ISER_RX_PAD_SIZE (ISCSI_DEF_MAX_RECV_SEG_LEN + 4096 - \ - (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge) + \ - sizeof(struct ib_cqe) + sizeof(bool))) +/* + * RX size is default of 8k plus headers, but data needs to align to + * 512 boundary, so use 1024 to have the extra space for alignment. + */ +#define ISER_RX_SIZE (ISCSI_DEF_MAX_RECV_SEG_LEN + 1024) #define ISCSI_ISER_SG_TABLESIZE 256 @@ -80,21 +82,41 @@ enum iser_conn_state { }; struct iser_rx_desc { - struct iser_ctrl iser_header; - struct iscsi_hdr iscsi_header; - char data[ISCSI_DEF_MAX_RECV_SEG_LEN]; + char buf[ISER_RX_SIZE]; u64 dma_addr; struct ib_sge rx_sg; struct ib_cqe rx_cqe; bool in_use; - char pad[ISER_RX_PAD_SIZE]; -} __packed; +}; static inline struct iser_rx_desc *cqe_to_rx_desc(struct ib_cqe *cqe) { return container_of(cqe, struct iser_rx_desc, rx_cqe); } +static void *isert_get_iser_hdr(struct iser_rx_desc *desc) +{ + return PTR_ALIGN(desc->buf + ISER_HEADERS_LEN, 512) - ISER_HEADERS_LEN; +} + +static size_t isert_get_hdr_offset(struct iser_rx_desc *desc) +{ + return isert_get_iser_hdr(desc) - (void *)desc->buf; +} + +static void *isert_get_iscsi_hdr(struct iser_rx_desc *desc) +{ + return isert_get_iser_hdr(desc) + sizeof(struct iser_ctrl); +} + +static void *isert_get_data(struct iser_rx_desc *desc) +{ + void *data = isert_get_iser_hdr(desc) + ISER_HEADERS_LEN; + + WARN_ON((uintptr_t)data & 511); + return data; +} + struct iser_tx_desc { struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; @@ -141,9 +163,8 @@ struct isert_conn { u32 responder_resources; u32 initiator_depth; bool pi_support; - struct iser_rx_desc *login_req_buf; + struct iser_rx_desc *login_desc; char *login_rsp_buf; - u64 login_req_dma; int login_req_len; u64 login_rsp_dma; struct iser_rx_desc *rx_descs; From 7c452e5f9fbfcb710c6154d54d70688b4dc63431 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 13 Sep 2022 18:20:24 +0530 Subject: [PATCH 52/54] devcoredump : Serialize devcd_del work [ Upstream commit 01daccf748323dfc61112f474cf2ba81015446b0 ] In following scenario(diagram), when one thread X running dev_coredumpm() adds devcd device to the framework which sends uevent notification to userspace and another thread Y reads this uevent and call to devcd_data_write() which eventually try to delete the queued timer that is not initialized/queued yet. 
So, debug object reports some warning and in the meantime, timer is initialized and queued from X path. and from Y path, it gets reinitialized again and timer->entry.pprev=NULL and try_to_grab_pending() stucks. To fix this, introduce mutex and a boolean flag to serialize the behaviour. cpu0(X) cpu1(Y) dev_coredump() uevent sent to user space device_add() ======================> user space process Y reads the uevents writes to devcd fd which results into writes to devcd_data_write() mod_delayed_work() try_to_grab_pending() del_timer() debug_assert_init() INIT_DELAYED_WORK() schedule_delayed_work() debug_object_fixup() timer_fixup_assert_init() timer_setup() do_init_timer() /* Above call reinitializes the timer to timer->entry.pprev=NULL and this will be checked later in timer_pending() call. */ timer_pending() !hlist_unhashed_lockless(&timer->entry) !h->pprev /* del_timer() checks h->pprev and finds it to be NULL due to which try_to_grab_pending() stucks. */ Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/ Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/1663073424-13663-1-git-send-email-quic_mojha@quicinc.com Signed-off-by: Greg Kroah-Hartman Stable-dep-of: af54d778a038 ("devcoredump: Send uevent once devcd is ready") Signed-off-by: Sasha Levin --- drivers/base/devcoredump.c | 83 +++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c index f1a3353f3494..1ba4af7f2f21 100644 --- a/drivers/base/devcoredump.c +++ b/drivers/base/devcoredump.c @@ -29,6 +29,47 @@ struct devcd_entry { struct device devcd_dev; void *data; size_t datalen; + /* + * Here, mutex is required to serialize the calls to del_wk work between + * user/kernel space which happens when devcd is added with device_add() + * and that sends uevent to user space. User space reads the uevents, + * and calls to devcd_data_write() which try to modify the work which is + * not even initialized/queued from devcoredump. + * + * + * + * cpu0(X) cpu1(Y) + * + * dev_coredump() uevent sent to user space + * device_add() ======================> user space process Y reads the + * uevents writes to devcd fd + * which results into writes to + * + * devcd_data_write() + * mod_delayed_work() + * try_to_grab_pending() + * del_timer() + * debug_assert_init() + * INIT_DELAYED_WORK() + * schedule_delayed_work() + * + * + * Also, mutex alone would not be enough to avoid scheduling of + * del_wk work after it get flush from a call to devcd_free() + * mentioned as below. + * + * disabled_store() + * devcd_free() + * mutex_lock() devcd_data_write() + * flush_delayed_work() + * mutex_unlock() + * mutex_lock() + * mod_delayed_work() + * mutex_unlock() + * So, delete_work flag is required. 
+ */ + struct mutex mutex; + bool delete_work; struct module *owner; ssize_t (*read)(char *buffer, loff_t offset, size_t count, void *data, size_t datalen); @@ -88,7 +129,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct devcd_entry *devcd = dev_to_devcd(dev); - mod_delayed_work(system_wq, &devcd->del_wk, 0); + mutex_lock(&devcd->mutex); + if (!devcd->delete_work) { + devcd->delete_work = true; + mod_delayed_work(system_wq, &devcd->del_wk, 0); + } + mutex_unlock(&devcd->mutex); return count; } @@ -116,7 +162,12 @@ static int devcd_free(struct device *dev, void *data) { struct devcd_entry *devcd = dev_to_devcd(dev); + mutex_lock(&devcd->mutex); + if (!devcd->delete_work) + devcd->delete_work = true; + flush_delayed_work(&devcd->del_wk); + mutex_unlock(&devcd->mutex); return 0; } @@ -126,6 +177,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr, return sprintf(buf, "%d\n", devcd_disabled); } +/* + * + * disabled_store() worker() + * class_for_each_device(&devcd_class, + * NULL, NULL, devcd_free) + * ... + * ... + * while ((dev = class_dev_iter_next(&iter)) + * devcd_del() + * device_del() + * put_device() <- last reference + * error = fn(dev, data) devcd_dev_release() + * devcd_free(dev, data) kfree(devcd) + * mutex_lock(&devcd->mutex); + * + * + * In the above diagram, It looks like disabled_store() would be racing with parallely + * running devcd_del() and result in memory abort while acquiring devcd->mutex which + * is called after kfree of devcd memory after dropping its last reference with + * put_device(). However, this will not happens as fn(dev, data) runs + * with its own reference to device via klist_node so it is not its last reference. + * so, above situation would not occur. + */ + static ssize_t disabled_store(struct class *class, struct class_attribute *attr, const char *buf, size_t count) { @@ -291,13 +366,16 @@ void dev_coredumpm(struct device *dev, struct module *owner, devcd->read = read; devcd->free = free; devcd->failing_dev = get_device(dev); + devcd->delete_work = false; + mutex_init(&devcd->mutex); device_initialize(&devcd->devcd_dev); dev_set_name(&devcd->devcd_dev, "devcd%d", atomic_inc_return(&devcd_count)); devcd->devcd_dev.class = &devcd_class; + mutex_lock(&devcd->mutex); if (device_add(&devcd->devcd_dev)) goto put_device; @@ -311,10 +389,11 @@ void dev_coredumpm(struct device *dev, struct module *owner, INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT); - + mutex_unlock(&devcd->mutex); return; put_device: put_device(&devcd->devcd_dev); + mutex_unlock(&devcd->mutex); put_module: module_put(owner); free: From 49d11d329a92f348d3e6bcac9ebfe05d785f1bb1 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Fri, 17 Nov 2023 20:19:32 +0530 Subject: [PATCH 53/54] devcoredump: Send uevent once devcd is ready [ Upstream commit af54d778a03853801d681c98c0c2a6c316ef9ca7 ] dev_coredumpm() creates a devcoredump device and adds it to the core kernel framework which eventually end up sending uevent to the user space and later creates a symbolic link to the failed device. An application running in userspace may be interested in this symbolic link to get the name of the failed device. In a issue scenario, once uevent sent to the user space it start reading '/sys/class/devcoredump/devcdX/failing_device' to get the actual name of the device which might not been created and it is in its path of creation. 
To fix this, suppress sending uevent till the failing device symbolic link gets created and send uevent once symbolic link is created successfully. Fixes: 833c95456a70 ("device coredump: add new device coredump class") Signed-off-by: Mukesh Ojha Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1700232572-25823-1-git-send-email-quic_mojha@quicinc.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/base/devcoredump.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c index 1ba4af7f2f21..b13574cdef69 100644 --- a/drivers/base/devcoredump.c +++ b/drivers/base/devcoredump.c @@ -376,6 +376,7 @@ void dev_coredumpm(struct device *dev, struct module *owner, devcd->devcd_dev.class = &devcd_class; mutex_lock(&devcd->mutex); + dev_set_uevent_suppress(&devcd->devcd_dev, true); if (device_add(&devcd->devcd_dev)) goto put_device; @@ -387,6 +388,8 @@ void dev_coredumpm(struct device *dev, struct module *owner, "devcoredump")) /* nothing - symlink will be missing */; + dev_set_uevent_suppress(&devcd->devcd_dev, false); + kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD); INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT); mutex_unlock(&devcd->mutex); From f93c1f58eb68bada8c86088104efe14cfe735957 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 13 Dec 2023 17:42:21 +0100 Subject: [PATCH 54/54] Linux 4.19.302 Link: https://lore.kernel.org/r/20231211182012.263036284@linuxfoundation.org Tested-by: Shuah Khan Link: https://lore.kernel.org/r/20231212120154.063773918@linuxfoundation.org Tested-by: Pavel Machek (CIP) Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1db751784cf2..acd4be908046 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 19 -SUBLEVEL = 301 +SUBLEVEL = 302 EXTRAVERSION = NAME = "People's Front"