From: Linus Torvalds Date: Fri, 22 Jul 2011 21:43:13 +0000 (-0700) Subject: Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next X-Git-Tag: v3.1-rc1~316 X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=commitdiff_plain;h=951cc93a7493a81a47e20231441bc6cf17c98a37;hp=415b3334a21aa67806c52d1acf4e72e14f7f402f Merge git://git./linux/kernel/git/davem/net-next * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1287 commits) icmp: Fix regression in nexthop resolution during replies. net: Fix ppc64 BPF JIT dependencies. acenic: include NET_SKB_PAD headroom to incoming skbs ixgbe: convert to ndo_fix_features ixgbe: only enable WoL for magic packet by default ixgbe: remove ifdef check for non-existent define ixgbe: Pass staterr instead of re-reading status and error bits from descriptor ixgbe: Move interrupt related values out of ring and into q_vector ixgbe: add structure for containing RX/TX rings to q_vector ixgbe: inline the ixgbe_maybe_stop_tx function ixgbe: Update ATR to use recorded TX queues instead of CPU for routing igb: Fix for DH89xxCC near end loopback test e1000: always call e1000_check_for_link() on e1000_ce4100 MACs. netxen: add fw version compatibility check be2net: request native mode each time the card is reset ipv4: Constrain UFO fragment sizes to multiples of 8 bytes virtio_net: Fix panic in virtnet_remove ipv6: make fragment identifications less predictable ipv6: unshare inetpeers can: make function can_get_bittiming static ... --- diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index 8e4fab639d9c..a0a61d2f389f 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt @@ -111,34 +111,6 @@ The following is an example of the kernel boot arguments to attach mtd0 to UBI and mount volume "rootfs": ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs - -Module Parameters for Debugging -=============================== - -When UBIFS has been compiled with debugging enabled, there are 2 module -parameters that are available to control aspects of testing and debugging. - -debug_chks Selects extra checks that UBIFS can do while running: - - Check Flag value - - General checks 1 - Check Tree Node Cache (TNC) 2 - Check indexing tree size 4 - Check orphan area 8 - Check old indexing tree 16 - Check LEB properties (lprops) 32 - Check leaf nodes and inodes 64 - -debug_tsts Selects a mode of testing, as follows: - - Test mode Flag value - - Failure mode for recovery testing 4 - -For example, set debug_chks to 3 to enable general and TNC checks. - - References ========== diff --git a/Documentation/mmc/00-INDEX b/Documentation/mmc/00-INDEX index 93dd7a714075..a9ba6720ffdf 100644 --- a/Documentation/mmc/00-INDEX +++ b/Documentation/mmc/00-INDEX @@ -4,3 +4,5 @@ mmc-dev-attrs.txt - info on SD and MMC device attributes mmc-dev-parts.txt - info on SD and MMC device partitions +mmc-async-req.txt + - info on mmc asynchronous requests diff --git a/Documentation/mmc/mmc-async-req.txt b/Documentation/mmc/mmc-async-req.txt new file mode 100644 index 000000000000..ae1907b10e4a --- /dev/null +++ b/Documentation/mmc/mmc-async-req.txt @@ -0,0 +1,87 @@ +Rationale +========= + +How significant is the cache maintenance overhead? +It depends. Fast eMMC and multiple cache levels with speculative cache +pre-fetch makes the cache overhead relatively significant. 
If the DMA
+preparations for the next request are done in parallel with the current
+transfer, the DMA preparation overhead does not affect MMC performance.
+The intention of non-blocking (asynchronous) MMC requests is to minimize the
+time between when one MMC request ends and the next one begins.
+With mmc_wait_for_req(), the MMC controller sits idle while dma_map_sg and
+dma_unmap_sg run. Using non-blocking MMC requests makes it
+possible to prepare the caches for the next job in parallel with an active
+MMC request.
+
+MMC block driver
+================
+
+The mmc_blk_issue_rw_rq() in the MMC block driver is made non-blocking.
+The increase in throughput is proportional to how long it takes to
+prepare a request (the major part of which is dma_map_sg() and
+dma_unmap_sg()) and how fast the memory is. The faster the MMC/SD is,
+the more significant the prepare time becomes. Roughly, the expected
+performance gain is 5% for large writes and 10% for large reads on an
+L2-cache platform. In power-save mode, when clocks run at a lower
+frequency, the DMA preparation may cost even more. As long as these
+slower preparations run in parallel with the transfer, performance
+won't be affected.
+
+Details on measurements from IOZone and mmc_test
+================================================
+
+https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
+
+MMC core API extension
+======================
+
+There is one new public function, mmc_start_req().
+It starts a new MMC command request for a host. The function isn't
+truly non-blocking: if there is an ongoing async request, it waits
+for that request to complete, starts the new one, and returns without
+waiting for the new request to complete. If there is no ongoing
+request, it starts the new request and returns immediately.
+
+MMC host extensions
+===================
+
+There are two optional members in mmc_host_ops -- pre_req() and
+post_req() -- that the host driver may implement in order to move work
+to before and after the actual mmc_host_ops.request() call.
+In the DMA case pre_req() may do dma_map_sg() and prepare the DMA
+descriptor, and post_req() runs the dma_unmap_sg().
+
+Optimize for the first request
+==============================
+
+The first request in a series of requests can't be prepared in parallel
+with the previous transfer, since there is no previous request.
+The argument is_first_req in pre_req() indicates that there is no previous
+request. The host driver may optimize for this scenario to minimize
+the performance loss: split the current request in two chunks,
+prepare the first chunk and start the request,
+and finally prepare the second chunk and start the transfer.
+
+Pseudocode to handle the is_first_req scenario with minimal prepare overhead:
+
+if (is_first_req && req->size > threshold)
+   /* start MMC transfer for the complete transfer size */
+   mmc_start_command(MMC_CMD_TRANSFER_FULL_SIZE);
+
+   /*
+    * Begin to prepare DMA while cmd is being processed by MMC.
+    * The first chunk of the request should take the same time
+    * to prepare as the "MMC process command time".
+    * If prepare time exceeds MMC cmd time
+    * the transfer is delayed; guesstimate max 4k as first chunk size.
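+    * (The threshold and the 4k first-chunk size are illustrative
+    * guesses, not measured values; a real driver would tune both
+    * against the command-processing time of its own controller.)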
+ */ + prepare_1st_chunk_for_dma(req); + /* flush pending desc to the DMAC (dmaengine.h) */ + dma_issue_pending(req->dma_desc); + + prepare_2nd_chunk_for_dma(req); + /* + * The second issue_pending should be called before MMC runs out + * of the first chunk. If the MMC runs out of the first data chunk + * before this call, the transfer is delayed. + */ + dma_issue_pending(req->dma_desc); diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index cd9d6af61d07..043bd7df3139 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -51,7 +51,7 @@ #include #include "../../../include/linux/lguest_launcher.h" /*L:110 - * We can ignore the 42 include files we need for this program, but I do want + * We can ignore the 43 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -65,7 +65,6 @@ typedef uint16_t u16; typedef uint8_t u8; /*:*/ -#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ #define BRIDGE_PFX "bridge:" #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ @@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq) /* writev can return a partial write, so we loop here. */ while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) - err(1, "Write to stdout gave %i", len); + if (len <= 0) { + warn("Write to stdout gave %i (%d)", len, errno); + break; + } iov_consume(iov, out, len); } @@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq) * same format: what a coincidence! */ if (writev(net_info->tunfd, iov, out) < 0) - errx(1, "Write to tun failed?"); + warnx("Write to tun failed (%d)?", errno); /* * Done with that one; wait_for_vq_desc() will send the interrupt if @@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq) */ len = readv(net_info->tunfd, iov, in); if (len <= 0) - err(1, "Failed to read from tun."); + warn("Failed to read from tun (%d).", errno); /* * Mark that packet buffer as used, but don't interrupt here. We want @@ -1093,9 +1094,10 @@ static void update_device_status(struct device *dev) warnx("Device %s configuration FAILED", dev->name); if (dev->running) reset_device(dev); - } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - if (!dev->running) - start_device(dev); + } else { + if (dev->running) + err(1, "Device %s features finalized twice", dev->name); + start_device(dev); } } @@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr) return; } - /* - * Devices *can* be used before status is set to DRIVER_OK. - * The original plan was that they would never do this: they - * would always finish setting up their status bits before - * actually touching the virtqueues. In practice, we allowed - * them to, and they do (eg. the disk probes for partition - * tables as part of initialization). - * - * If we see this, we start the device: once it's running, we - * expect the device to catch all the notifications. - */ + /* Devices should not be used before features are finalized. 
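+	 * A notification before that point is treated as a Guest driver
+	 * bug: we report which device raised it and give up below.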
*/ for (vq = i->vq; vq; vq = vq->next) { if (addr != vq->config.pfn*getpagesize()) continue; - if (i->running) - errx(1, "Notification on running %s", i->name); - /* This just calls create_thread() for each virtqueue */ - start_device(i); - return; + errx(1, "Notification on %s before setup!", i->name); } } @@ -1370,7 +1358,7 @@ static void setup_console(void) * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. * - * More sopisticated is to use one of the tools developed for project like UML + * More sophisticated is to use one of the tools developed for project like UML * to do networking. * * Faster is to do virtio bonding in kernel. Doing this 1:1 would be @@ -1380,7 +1368,7 @@ static void setup_console(void) * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * - * Finally, we could implement a virtio network switch in the kernel. + * Finally, we could use a virtio network switch in the kernel, ie. vhost. :*/ static u32 str2ip(const char *ipaddr) @@ -2017,10 +2005,7 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; - /* - * We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. - */ + /* We tell the kernel to initialize the Guest. */ tell_kernel(start); /* Ensure that we terminate if a device-servicing child dies. */ diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 9b7221a86df2..7c3a8801b7ce 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -674,7 +674,7 @@ Protocol: 2.10+ Field name: init_size Type: read -Offset/size: 0x25c/4 +Offset/size: 0x260/4 This field indicates the amount of linear contiguous memory starting at the kernel runtime start address that the kernel needs before it diff --git a/MAINTAINERS b/MAINTAINERS index 7a9569e06b0a..81cf5fb615e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1,4 +1,5 @@ + List of maintainers and how to submit kernel changes Please try to follow the guidelines below. 
This will make things @@ -4584,9 +4585,8 @@ S: Maintained F: drivers/mmc/host/omap.c OMAP HS MMC SUPPORT -M: Madhusudhan Chikkature L: linux-omap@vger.kernel.org -S: Maintained +S: Orphan F: drivers/mmc/host/omap_hsmmc.c OMAP RANDOM NUMBER GENERATOR SUPPORT @@ -6242,9 +6242,14 @@ F: drivers/char/toshiba.c F: include/linux/toshiba.h TMIO MMC DRIVER +M: Guennadi Liakhovetski M: Ian Molton +L: linux-mmc@vger.kernel.org S: Maintained -F: drivers/mmc/host/tmio_mmc.* +F: drivers/mmc/host/tmio_mmc* +F: drivers/mmc/host/sh_mobile_sdhi.c +F: include/linux/mmc/tmio.h +F: include/linux/mmc/sh_mobile_sdhi.h TMPFS (SHMEM FILESYSTEM) M: Hugh Dickins @@ -6321,7 +6326,7 @@ F: drivers/scsi/u14-34f.c UBI FILE SYSTEM (UBIFS) M: Artem Bityutskiy -M: Adrian Hunter +M: Adrian Hunter L: linux-mtd@lists.infradead.org T: git git://git.infradead.org/ubifs-2.6.git W: http://www.linux-mtd.infradead.org/doc/ubifs.html diff --git a/Makefile b/Makefile index 60d91f76c2fd..6a5bdad524af 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 0 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Sneaky Weasel # *DOCUMENTATION* diff --git a/arch/arm/configs/mmp2_defconfig b/arch/arm/configs/mmp2_defconfig index 47ad3b1a4fee..5a584520db2f 100644 --- a/arch/arm/configs/mmp2_defconfig +++ b/arch/arm/configs/mmp2_defconfig @@ -8,6 +8,7 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_ARCH_MMP=y +CONFIG_MACH_BROWNSTONE=y CONFIG_MACH_FLINT=y CONFIG_MACH_MARVELL_JASPER=y CONFIG_HIGH_RES_TIMERS=y @@ -63,10 +64,16 @@ CONFIG_BACKLIGHT_MAX8925=y # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_MAX8925=y +CONFIG_MMC=y # CONFIG_DNOTIFY is not set CONFIG_INOTIFY=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +CONFIG_EXT4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_FAT_DEFAULT_CODEPAGE=437 CONFIG_JFFS2_FS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y @@ -81,7 +88,7 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_PREEMPT is not set CONFIG_DEBUG_INFO=y # CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_DYNAMIC_DEBUG=y +# CONFIG_DYNAMIC_DEBUG is not set CONFIG_DEBUG_USER=y CONFIG_DEBUG_ERRORS=y # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/arm/mach-mmp/brownstone.c b/arch/arm/mach-mmp/brownstone.c index 7bb78fd5a2a6..c79162a50f28 100644 --- a/arch/arm/mach-mmp/brownstone.c +++ b/arch/arm/mach-mmp/brownstone.c @@ -177,9 +177,16 @@ static struct i2c_board_info brownstone_twsi1_info[] = { }; static struct sdhci_pxa_platdata mmp2_sdh_platdata_mmc0 = { - .max_speed = 25000000, + .clk_delay_cycles = 0x1f, }; +static struct sdhci_pxa_platdata mmp2_sdh_platdata_mmc2 = { + .clk_delay_cycles = 0x1f, + .flags = PXA_FLAG_CARD_PERMANENT + | PXA_FLAG_SD_8_BIT_CAPABLE_SLOT, +}; + + static void __init brownstone_init(void) { mfp_config(ARRAY_AND_SIZE(brownstone_pin_config)); @@ -189,6 +196,7 @@ static void __init brownstone_init(void) mmp2_add_uart(3); mmp2_add_twsi(1, NULL, ARRAY_AND_SIZE(brownstone_twsi1_info)); mmp2_add_sdhost(0, &mmp2_sdh_platdata_mmc0); /* SD/MMC */ + mmp2_add_sdhost(2, &mmp2_sdh_platdata_mmc2); /* eMMC */ /* enable 5v regulator */ platform_device_register(&brownstone_v_5vp_device); diff --git a/arch/arm/mach-mmp/include/mach/mmp2.h b/arch/arm/mach-mmp/include/mach/mmp2.h index 2cbf6df09b82..de7b88826ad7 100644 --- a/arch/arm/mach-mmp/include/mach/mmp2.h +++ b/arch/arm/mach-mmp/include/mach/mmp2.h @@ -1,7 +1,7 @@ #ifndef __ASM_MACH_MMP2_H #define __ASM_MACH_MMP2_H -#include +#include struct sys_timer; diff --git 
a/arch/arm/mach-mmp/jasper.c b/arch/arm/mach-mmp/jasper.c index 24172a0aad59..5d6421d63254 100644 --- a/arch/arm/mach-mmp/jasper.c +++ b/arch/arm/mach-mmp/jasper.c @@ -154,7 +154,7 @@ static struct i2c_board_info jasper_twsi1_info[] = { }; static struct sdhci_pxa_platdata mmp2_sdh_platdata_mmc0 = { - .max_speed = 25000000, + .clk_delay_cycles = 0x1f, }; static void __init jasper_init(void) diff --git a/arch/arm/mach-mmp/mmp2.c b/arch/arm/mach-mmp/mmp2.c index 8e6c3ac7f7c1..079c18861d5c 100644 --- a/arch/arm/mach-mmp/mmp2.c +++ b/arch/arm/mach-mmp/mmp2.c @@ -168,10 +168,10 @@ static struct clk_lookup mmp2_clkregs[] = { INIT_CLKREG(&clk_twsi5, "pxa2xx-i2c.4", NULL), INIT_CLKREG(&clk_twsi6, "pxa2xx-i2c.5", NULL), INIT_CLKREG(&clk_nand, "pxa3xx-nand", NULL), - INIT_CLKREG(&clk_sdh0, "sdhci-pxa.0", "PXA-SDHCLK"), - INIT_CLKREG(&clk_sdh1, "sdhci-pxa.1", "PXA-SDHCLK"), - INIT_CLKREG(&clk_sdh2, "sdhci-pxa.2", "PXA-SDHCLK"), - INIT_CLKREG(&clk_sdh3, "sdhci-pxa.3", "PXA-SDHCLK"), + INIT_CLKREG(&clk_sdh0, "sdhci-pxav3.0", "PXA-SDHCLK"), + INIT_CLKREG(&clk_sdh1, "sdhci-pxav3.1", "PXA-SDHCLK"), + INIT_CLKREG(&clk_sdh2, "sdhci-pxav3.2", "PXA-SDHCLK"), + INIT_CLKREG(&clk_sdh3, "sdhci-pxav3.3", "PXA-SDHCLK"), }; static int __init mmp2_init(void) @@ -222,8 +222,8 @@ MMP2_DEVICE(twsi4, "pxa2xx-i2c", 3, TWSI4, 0xd4033000, 0x70); MMP2_DEVICE(twsi5, "pxa2xx-i2c", 4, TWSI5, 0xd4033800, 0x70); MMP2_DEVICE(twsi6, "pxa2xx-i2c", 5, TWSI6, 0xd4034000, 0x70); MMP2_DEVICE(nand, "pxa3xx-nand", -1, NAND, 0xd4283000, 0x100, 28, 29); -MMP2_DEVICE(sdh0, "sdhci-pxa", 0, MMC, 0xd4280000, 0x120); -MMP2_DEVICE(sdh1, "sdhci-pxa", 1, MMC2, 0xd4280800, 0x120); -MMP2_DEVICE(sdh2, "sdhci-pxa", 2, MMC3, 0xd4281000, 0x120); -MMP2_DEVICE(sdh3, "sdhci-pxa", 3, MMC4, 0xd4281800, 0x120); +MMP2_DEVICE(sdh0, "sdhci-pxav3", 0, MMC, 0xd4280000, 0x120); +MMP2_DEVICE(sdh1, "sdhci-pxav3", 1, MMC2, 0xd4280800, 0x120); +MMP2_DEVICE(sdh2, "sdhci-pxav3", 2, MMC3, 0xd4281000, 0x120); +MMP2_DEVICE(sdh3, "sdhci-pxav3", 3, MMC4, 0xd4281800, 0x120); diff --git a/arch/arm/plat-pxa/include/plat/sdhci.h b/arch/arm/plat-pxa/include/plat/sdhci.h deleted file mode 100644 index 1ab332e37d7d..000000000000 --- a/arch/arm/plat-pxa/include/plat/sdhci.h +++ /dev/null @@ -1,35 +0,0 @@ -/* linux/arch/arm/plat-pxa/include/plat/sdhci.h - * - * Copyright 2010 Marvell - * Zhangfei Gao - * - * PXA Platform - SDHCI platform data definitions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#ifndef __PLAT_PXA_SDHCI_H -#define __PLAT_PXA_SDHCI_H - -/* pxa specific flag */ -/* Require clock free running */ -#define PXA_FLAG_DISABLE_CLOCK_GATING (1<<0) - -/* Board design supports 8-bit data on SD/SDIO BUS */ -#define PXA_FLAG_SD_8_BIT_CAPABLE_SLOT (1<<2) - -/* - * struct pxa_sdhci_platdata() - Platform device data for PXA SDHCI - * @max_speed: the maximum speed supported - * @quirks: quirks of specific device - * @flags: flags for platform requirement - */ -struct sdhci_pxa_platdata { - unsigned int max_speed; - unsigned int quirks; - unsigned int flags; -}; - -#endif /* __PLAT_PXA_SDHCI_H */ diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index c7ad3fe2b252..b928b31424b1 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -205,6 +205,7 @@ do { current_thread_info()->syscall_noerror = 1; \ } while (0) #define user_mode(regs) (!((regs)->tstate & TSTATE_PRIV)) #define instruction_pointer(regs) ((regs)->tpc) +#define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) #define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) #ifdef CONFIG_SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da349723d411..37357a599dca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1170,7 +1170,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" - depends on NUMA && PCI + depends on X86_64 && NUMA && PCI ---help--- Enable AMD NUMA node topology detection. You should say Y here if you have a multi processor AMD system. This uses an old method to diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b60f2924c413..879fd7d33877 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -61,6 +61,7 @@ hcall(unsigned long call, : "memory"); return call; } +/*:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 4fbda9a3f339..968d57dd54c9 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -14,13 +14,14 @@ static inline int pci_xen_hvm_init(void) } #endif #if defined(CONFIG_XEN_DOM0) -void __init xen_setup_pirqs(void); +int __init pci_xen_initial_domain(void); int xen_find_device_domain_owner(struct pci_dev *dev); int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); int xen_unregister_device_domain_owner(struct pci_dev *dev); #else -static inline void __init xen_setup_pirqs(void) +static inline int __init pci_xen_initial_domain(void) { + return -1; } static inline int xen_find_device_domain_owner(struct pci_dev *dev) { diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index c29d631af6fc..395a10e68067 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -63,7 +63,6 @@ void foo(void) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4f0d46fefa7f..9242436e9937 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -419,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the Latitude E6320. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6320", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), + }, + }, + { /* Handle problems with rebooting on the Latitude E5420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), + }, + }, { } }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index db832fd65ecb..13ee258442ae 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -71,7 +71,8 @@ #include #include /* for struct machine_ops */ -/*G:010 Welcome to the Guest! +/*G:010 + * Welcome to the Guest! * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the @@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call, #endif /*G:036 - * When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. -:*/ + * When lazy mode is turned off, we issue the do-nothing hypercall to + * flush any stored calls, and call the generic helper to reset the + * per-cpu lazy mode variable. + */ static void lguest_leave_lazy_mmu_mode(void) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_leave_lazy_mmu(); } +/* + * We also catch the end of context switch; we enter lazy mode for much of + * that too, so again we need to flush here. + * + * (Technically, this is lazy CPU mode, and normally we're in lazy MMU + * mode, but unlike Xen, lguest doesn't care about the difference). 
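+ * Either way, the LHCALL_FLUSH_ASYNC hypercall below drains whatever
+ * operations were batched up while lazy mode was on.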
+ */ static void lguest_end_context_switch(struct task_struct *next) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); @@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void) * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 5 languages. I am not making this up! + * has been translated into 6 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned @@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, /* * PAE systems can mark pages as non-executable. Linux calls this the * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch turn if off here, since we don't + * Virus Protection). We just switch it off here, since we don't * support it. */ case 0x80000001: @@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; +static unsigned long current_cr3; /* * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. The only - * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. + * cr0. Keep a local copy, and tell the Host when it changes. */ static void lguest_write_cr3(unsigned long cr3) { - lguest_data.pgdir = cr3; lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); + current_cr3 = cr3; /* These two page tables are simple, linear, and used during boot */ if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) @@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3) static unsigned long lguest_read_cr3(void) { - return lguest_data.pgdir; + return current_cr3; } /* cr4 is used to enable and disable PGE, but we don't care. */ @@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val) /* * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. Wetell the Host the toplevel and + * a page into a process' address space. We tell the Host the toplevel and * address this corresponds to. The Guest uses one pagetable per process, so * we need to tell the Host which one we're changing (mm->pgd). */ @@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp) static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); + lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); } /* @@ -1140,7 +1148,7 @@ static struct notifier_block paniced = { static __init char *lguest_memory_setup(void) { /* - *The Linux bootloader header contains an "e820" memory map: the + * The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ e820_add_region(boot_params.e820_map[0].addr, diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 4f420c2f2d55..6ddfe4fc23c3 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -6,18 +6,22 @@ #include /*G:020 - * Our story starts with the kernel booting into startup_32 in - * arch/x86/kernel/head_32.S. It expects a boot header, which is created by - * the bootloader (the Launcher in our case). 
+ + * Our story starts with the bzImage: booting starts at startup_32 in + * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real + * kernel in place and then jumps into it: startup_32 in + * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi + * register, which is created by the bootloader (the Launcher in our case). * * The startup_32 function does very little: it clears the uninitialized global * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe. Finally it checks the - * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: - * if it's set to '1' (lguest's assigned number), then it calls us here. + * header and kernel command line somewhere safe, and populates some initial + * page tables. Finally it checks the 'hardware_subarch' field. This was + * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's + * assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical - * addesses (around 0), not above PAGE_OFFSET as most code expectes + * addresses (around 0), not above PAGE_OFFSET as most code expects * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any * data without remembering to subtract __PAGE_OFFSET! * @@ -27,13 +31,18 @@ .section .init.text, "ax", @progbits ENTRY(lguest_entry) /* - * We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. + * We make the "initialization" hypercall now to tell the Host where + * our lguest_data struct is. */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx int $LGUEST_TRAP_ENTRY + /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ + movl $LHCALL_NEW_PGTABLE, %eax + movl $(initial_page_table - __PAGE_OFFSET), %ebx + int $LGUEST_TRAP_ENTRY + /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp @@ -96,12 +105,8 @@ send_interrupts: */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* - * This is a vmcall instruction (same thing that KVM uses). Older - * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. - */ - .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* This is the actual hypercall trap. */ + int $LGUEST_TRAP_ENTRY /* Put eax back the way we found it. */ popl %eax ret diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index f567965c0620..1017c7bee388 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -1,8 +1,13 @@ /* - * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux - * x86 PCI core to support the Xen PCI Frontend + * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and + * initial domain support. We also handle the DSDT _PRT callbacks for GSI's + * used in HVM and initial domain mode (PV does not parse ACPI, so it has no + * concept of GSIs). Under PV we hook under the pnbbios API for IRQs and + * 0xcf8 PCI configuration read/write. 
* * Author: Ryan Wilson + * Konrad Rzeszutek Wilk + * Stefano Stabellini */ #include #include @@ -19,22 +24,53 @@ #include #include +static int xen_pcifront_enable_irq(struct pci_dev *dev) +{ + int rc; + int share = 1; + int pirq; + u8 gsi; + + rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", + rc); + return rc; + } + /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/ + pirq = gsi; + + if (gsi < NR_IRQS_LEGACY) + share = 0; + + rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", + gsi, pirq, rc); + return rc; + } + + dev->irq = rc; + dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); + return 0; +} + #ifdef CONFIG_ACPI -static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, - int trigger, int polarity) +static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, + bool set_pirq) { - int rc, irq; + int rc, pirq = -1, irq = -1; struct physdev_map_pirq map_irq; int shareable = 0; char *name; - if (!xen_hvm_domain()) - return -1; + if (set_pirq) + pirq = gsi; map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; map_irq.index = gsi; - map_irq.pirq = -1; + map_irq.pirq = pirq; rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { @@ -42,7 +78,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, return -1; } - if (trigger == ACPI_EDGE_SENSITIVE) { + if (triggering == ACPI_EDGE_SENSITIVE) { shareable = 0; name = "ioapic-edge"; } else { @@ -50,12 +86,63 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, name = "ioapic-level"; } + if (gsi_override >= 0) + gsi = gsi_override; + irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); + if (irq < 0) + goto out; + + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi); +out: + return irq; +} + +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + if (!xen_hvm_domain()) + return -1; - printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); + return xen_register_pirq(gsi, -1 /* no GSI override */, trigger, + false /* no mapping of GSI to PIRQ */); +} + +#ifdef CONFIG_XEN_DOM0 +static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) +{ + int rc, irq; + struct physdev_setup_gsi setup_gsi; + + if (!xen_pv_domain()) + return -1; + + printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", + gsi, triggering, polarity); + + irq = xen_register_pirq(gsi, gsi_override, triggering, true); + + setup_gsi.gsi = gsi; + setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); + setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); + if (rc == -EEXIST) + printk(KERN_INFO "Already setup the GSI :%d\n", gsi); + else if (rc) { + printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", + gsi, rc); + } return irq; } + +static int acpi_register_gsi_xen(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); +} +#endif #endif #if defined(CONFIG_PCI_MSI) @@ -65,6 +152,43 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); +static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret, i; + struct msi_desc *msidesc; + int *v; + + v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL); + if (!v) + return -ENOMEM; + + if (type == PCI_CAP_ID_MSIX) + ret = xen_pci_frontend_enable_msix(dev, v, nvec); + else + ret = xen_pci_frontend_enable_msi(dev, v); + if (ret) + goto error; + i = 0; + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, + (type == PCI_CAP_ID_MSIX) ? + "pcifront-msi-x" : + "pcifront-msi", + DOMID_SELF); + if (irq < 0) + goto free; + i++; + } + kfree(v); + return 0; + +error: + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); +free: + kfree(v); + return ret; +} + #define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) @@ -123,67 +247,6 @@ error: return -ENODEV; } -/* - * For MSI interrupts we have to use drivers/xen/event.s functions to - * allocate an irq_desc and setup the right */ - - -static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - int irq, ret, i; - struct msi_desc *msidesc; - int *v; - - v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL); - if (!v) - return -ENOMEM; - - if (type == PCI_CAP_ID_MSIX) - ret = xen_pci_frontend_enable_msix(dev, v, nvec); - else - ret = xen_pci_frontend_enable_msi(dev, v); - if (ret) - goto error; - i = 0; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, - (type == PCI_CAP_ID_MSIX) ? - "pcifront-msi-x" : - "pcifront-msi", - DOMID_SELF); - if (irq < 0) - goto free; - i++; - } - kfree(v); - return 0; - -error: - dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); -free: - kfree(v); - return ret; -} - -static void xen_teardown_msi_irqs(struct pci_dev *dev) -{ - struct msi_desc *msidesc; - - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); - if (msidesc->msi_attrib.is_msix) - xen_pci_frontend_disable_msix(dev); - else - xen_pci_frontend_disable_msi(dev); - - /* Free the IRQ's and the msidesc using the generic code. 
*/ - default_teardown_msi_irqs(dev); -} - -static void xen_teardown_msi_irq(unsigned int irq) -{ - xen_destroy_irq(irq); -} - #ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { @@ -242,45 +305,28 @@ out: return ret; } #endif -#endif -static int xen_pcifront_enable_irq(struct pci_dev *dev) +static void xen_teardown_msi_irqs(struct pci_dev *dev) { - int rc; - int share = 1; - int pirq; - u8 gsi; - - rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); - if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", - rc); - return rc; - } - - rc = xen_allocate_pirq_gsi(gsi); - if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n", - gsi, rc); - return rc; - } - pirq = rc; + struct msi_desc *msidesc; - if (gsi < NR_IRQS_LEGACY) - share = 0; + msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + if (msidesc->msi_attrib.is_msix) + xen_pci_frontend_disable_msix(dev); + else + xen_pci_frontend_disable_msi(dev); - rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); - if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", - gsi, pirq, rc); - return rc; - } + /* Free the IRQ's and the msidesc using the generic code. */ + default_teardown_msi_irqs(dev); +} - dev->irq = rc; - dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); - return 0; +static void xen_teardown_msi_irq(unsigned int irq) +{ + xen_destroy_irq(irq); } +#endif + int __init pci_xen_init(void) { if (!xen_pv_domain() || xen_initial_domain()) @@ -327,79 +373,6 @@ int __init pci_xen_hvm_init(void) } #ifdef CONFIG_XEN_DOM0 -static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) -{ - int rc, pirq, irq = -1; - struct physdev_map_pirq map_irq; - int shareable = 0; - char *name; - - if (!xen_pv_domain()) - return -1; - - if (triggering == ACPI_EDGE_SENSITIVE) { - shareable = 0; - name = "ioapic-edge"; - } else { - shareable = 1; - name = "ioapic-level"; - } - pirq = xen_allocate_pirq_gsi(gsi); - if (pirq < 0) - goto out; - - if (gsi_override >= 0) - irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name); - else - irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); - if (irq < 0) - goto out; - - printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi); - - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_GSI; - map_irq.index = gsi; - map_irq.pirq = pirq; - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - printk(KERN_WARNING "xen map irq failed %d\n", rc); - return -1; - } - -out: - return irq; -} - -static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) -{ - int rc, irq; - struct physdev_setup_gsi setup_gsi; - - if (!xen_pv_domain()) - return -1; - - printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", - gsi, triggering, polarity); - - irq = xen_register_pirq(gsi, gsi_override, triggering); - - setup_gsi.gsi = gsi; - setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); - setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); - if (rc == -EEXIST) - printk(KERN_INFO "Already setup the GSI :%d\n", gsi); - else if (rc) { - printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", - gsi, rc); - } - - return irq; -} - static __init void xen_setup_acpi_sci(void) { int rc; @@ -419,7 +392,7 @@ static __init void xen_setup_acpi_sci(void) } trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; - + printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d " "polarity=%d\n", gsi, trigger, polarity); @@ -434,10 +407,9 @@ static __init void xen_setup_acpi_sci(void) * the ACPI interpreter and keels over since IRQ 9 has not been * setup as we had setup IRQ 20 for it). */ - /* Check whether the GSI != IRQ */ if (acpi_gsi_to_irq(gsi, &irq) == 0) { - if (irq >= 0 && irq != gsi) - /* Bugger, we MUST have that IRQ. */ + /* Use the provided value if it's valid. */ + if (irq >= 0) gsi_override = irq; } @@ -447,41 +419,16 @@ static __init void xen_setup_acpi_sci(void) return; } -static int acpi_register_gsi_xen(struct device *dev, u32 gsi, - int trigger, int polarity) +int __init pci_xen_initial_domain(void) { - return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); -} + int irq; -static int __init pci_xen_initial_domain(void) -{ #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; #endif xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; - - return 0; -} - -void __init xen_setup_pirqs(void) -{ - int pirq, irq; - - pci_xen_initial_domain(); - - if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { - pirq = xen_allocate_pirq_gsi(irq); - if (WARN(pirq < 0, - "Could not allocate PIRQ for legacy interrupt\n")) - break; - irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); - } - return; - } - /* Pre-allocate legacy irqs */ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { int trigger, polarity; @@ -490,12 +437,16 @@ void __init xen_setup_pirqs(void) continue; xen_register_pirq(irq, -1 /* no GSI override */, - trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); + trigger ? 
ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, + true /* Map GSI to PIRQ */); } + if (0 == nr_ioapics) { + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) + xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic"); + } + return 0; } -#endif -#ifdef CONFIG_XEN_DOM0 struct xen_device_domain_owner { domid_t domain; struct pci_dev *dev; diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 17c565de3d64..a6575b949b11 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -18,5 +18,5 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o - +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5525163a0398..53257421082b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1248,6 +1248,14 @@ asmlinkage void __init xen_start_kernel(void) if (pci_xen) x86_init.pci.arch_init = pci_xen_init; } else { + const struct dom0_vga_console_info *info = + (void *)((char *)xen_start_info + + xen_start_info->console.dom0.info_off); + + xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_start_info->console.domU.mfn = 0; + xen_start_info->console.domU.evtchn = 0; + /* Make sure ACS will be enabled */ pci_request_acs(); } diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 25c52f94a27c..ffcf2615640b 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); #ifdef CONFIG_XEN_PVHVM static int xen_emul_unplug; -static int __init check_platform_magic(void) +static int check_platform_magic(void) { short magic; char protocol; diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c new file mode 100644 index 000000000000..1cd7f4d11e29 --- /dev/null +++ b/arch/x86/xen/vga.c @@ -0,0 +1,67 @@ +#include +#include + +#include +#include + +#include + +#include "xen-ops.h" + +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +{ + struct screen_info *screen_info = &boot_params.screen_info; + + /* This is drawn from a dump from vgacon:startup in + * standard Linux. 
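+	 * The defaults below describe an ordinary 80x25 VGA text console
+	 * (mode 3, 16-pixel font); the switch further down overrides them
+	 * with whatever mode Xen actually reports.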
*/ + screen_info->orig_video_mode = 3; + screen_info->orig_video_isVGA = 1; + screen_info->orig_video_lines = 25; + screen_info->orig_video_cols = 80; + screen_info->orig_video_ega_bx = 3; + screen_info->orig_video_points = 16; + screen_info->orig_y = screen_info->orig_video_lines - 1; + + switch (info->video_type) { + case XEN_VGATYPE_TEXT_MODE_3: + if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) + + sizeof(info->u.text_mode_3)) + break; + screen_info->orig_video_lines = info->u.text_mode_3.rows; + screen_info->orig_video_cols = info->u.text_mode_3.columns; + screen_info->orig_x = info->u.text_mode_3.cursor_x; + screen_info->orig_y = info->u.text_mode_3.cursor_y; + screen_info->orig_video_points = + info->u.text_mode_3.font_height; + break; + + case XEN_VGATYPE_VESA_LFB: + if (size < offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps)) + break; + screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB; + screen_info->lfb_width = info->u.vesa_lfb.width; + screen_info->lfb_height = info->u.vesa_lfb.height; + screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel; + screen_info->lfb_base = info->u.vesa_lfb.lfb_base; + screen_info->lfb_size = info->u.vesa_lfb.lfb_size; + screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line; + screen_info->red_size = info->u.vesa_lfb.red_size; + screen_info->red_pos = info->u.vesa_lfb.red_pos; + screen_info->green_size = info->u.vesa_lfb.green_size; + screen_info->green_pos = info->u.vesa_lfb.green_pos; + screen_info->blue_size = info->u.vesa_lfb.blue_size; + screen_info->blue_pos = info->u.vesa_lfb.blue_pos; + screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; + screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps) + + sizeof(info->u.vesa_lfb.gbl_caps)) + screen_info->capabilities = info->u.vesa_lfb.gbl_caps; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.mode_attrs) + + sizeof(info->u.vesa_lfb.mode_attrs)) + screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; + break; + } +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 97dfdc8757b3..b095739ccd4c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu) } #endif +struct dom0_vga_console_info; + +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +#else +static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, + size_t size) +{ +} +#endif + /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) 
\ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 6cc0db1bf522..3f129b45451a 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -684,7 +684,7 @@ again: err = xenbus_switch_state(dev, XenbusStateConnected); if (err) - xenbus_dev_fatal(dev, err, "switching to Connected state", + xenbus_dev_fatal(dev, err, "%s: switching to Connected state", dev->nodename); return; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f245c588ae95..ce7914c4c044 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -262,6 +262,7 @@ enum intel_pch { }; #define QUIRK_PIPEA_FORCE (1<<0) +#define QUIRK_LVDS_SSC_DISABLE (1<<1) struct intel_fbdev; @@ -1194,7 +1195,9 @@ void i915_gem_free_all_phys_object(struct drm_device *dev); void i915_gem_release(struct drm_device *dev, struct drm_file *file); uint32_t -i915_gem_get_unfenced_gtt_alignment(struct drm_i915_gem_object *obj); +i915_gem_get_unfenced_gtt_alignment(struct drm_device *dev, + uint32_t size, + int tiling_mode); /* i915_gem_gtt.c */ void i915_gem_restore_gtt_mappings(struct drm_device *dev); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 5c0d1247f453..a087e1bf0c2f 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1374,25 +1374,24 @@ i915_gem_free_mmap_offset(struct drm_i915_gem_object *obj) } static uint32_t -i915_gem_get_gtt_size(struct drm_i915_gem_object *obj) +i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size, int tiling_mode) { - struct drm_device *dev = obj->base.dev; - uint32_t size; + uint32_t gtt_size; if (INTEL_INFO(dev)->gen >= 4 || - obj->tiling_mode == I915_TILING_NONE) - return obj->base.size; + tiling_mode == I915_TILING_NONE) + return size; /* Previous chips need a power-of-two fence region when tiling */ if (INTEL_INFO(dev)->gen == 3) - size = 1024*1024; + gtt_size = 1024*1024; else - size = 512*1024; + gtt_size = 512*1024; - while (size < obj->base.size) - size <<= 1; + while (gtt_size < size) + gtt_size <<= 1; - return size; + return gtt_size; } /** @@ -1403,59 +1402,52 @@ i915_gem_get_gtt_size(struct drm_i915_gem_object *obj) * potential fence register mapping. */ static uint32_t -i915_gem_get_gtt_alignment(struct drm_i915_gem_object *obj) +i915_gem_get_gtt_alignment(struct drm_device *dev, + uint32_t size, + int tiling_mode) { - struct drm_device *dev = obj->base.dev; - /* * Minimum alignment is 4k (GTT page size), but might be greater * if a fence register is needed for the object. */ if (INTEL_INFO(dev)->gen >= 4 || - obj->tiling_mode == I915_TILING_NONE) + tiling_mode == I915_TILING_NONE) return 4096; /* * Previous chips need to be aligned to the size of the smallest * fence register that can contain the object. */ - return i915_gem_get_gtt_size(obj); + return i915_gem_get_gtt_size(dev, size, tiling_mode); } /** * i915_gem_get_unfenced_gtt_alignment - return required GTT alignment for an * unfenced object - * @obj: object to check + * @dev: the device + * @size: size of the object + * @tiling_mode: tiling mode of the object * * Return the required GTT alignment for an object, only taking into account * unfenced tiled surface requirements. 
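 * The result is always a power of two and at least 4096 bytes (the GTT
 * page size).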
*/ uint32_t -i915_gem_get_unfenced_gtt_alignment(struct drm_i915_gem_object *obj) +i915_gem_get_unfenced_gtt_alignment(struct drm_device *dev, + uint32_t size, + int tiling_mode) { - struct drm_device *dev = obj->base.dev; - int tile_height; - /* * Minimum alignment is 4k (GTT page size) for sane hw. */ if (INTEL_INFO(dev)->gen >= 4 || IS_G33(dev) || - obj->tiling_mode == I915_TILING_NONE) + tiling_mode == I915_TILING_NONE) return 4096; - /* - * Older chips need unfenced tiled buffers to be aligned to the left - * edge of an even tile row (where tile rows are counted as if the bo is - * placed in a fenced gtt region). + /* Previous hardware however needs to be aligned to a power-of-two + * tile height. The simplest method for determining this is to reuse + * the power-of-tile object size. */ - if (IS_GEN2(dev)) - tile_height = 16; - else if (obj->tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)) - tile_height = 32; - else - tile_height = 8; - - return tile_height * obj->stride * 2; + return i915_gem_get_gtt_size(dev, size, tiling_mode); } int @@ -2744,9 +2736,16 @@ i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj, return -EINVAL; } - fence_size = i915_gem_get_gtt_size(obj); - fence_alignment = i915_gem_get_gtt_alignment(obj); - unfenced_alignment = i915_gem_get_unfenced_gtt_alignment(obj); + fence_size = i915_gem_get_gtt_size(dev, + obj->base.size, + obj->tiling_mode); + fence_alignment = i915_gem_get_gtt_alignment(dev, + obj->base.size, + obj->tiling_mode); + unfenced_alignment = + i915_gem_get_unfenced_gtt_alignment(dev, + obj->base.size, + obj->tiling_mode); if (alignment == 0) alignment = map_and_fenceable ? fence_alignment : diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index 82d70fd9e933..99c4faa59d8f 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -348,7 +348,9 @@ i915_gem_set_tiling(struct drm_device *dev, void *data, /* Rebind if we need a change of alignment */ if (!obj->map_and_fenceable) { u32 unfenced_alignment = - i915_gem_get_unfenced_gtt_alignment(obj); + i915_gem_get_unfenced_gtt_alignment(dev, + obj->base.size, + args->tiling_mode); if (obj->gtt_offset & (unfenced_alignment - 1)) ret = i915_gem_object_unbind(obj); } diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 21b6f93fe919..0f1c799afea1 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -4305,7 +4305,8 @@ static void intel_update_watermarks(struct drm_device *dev) static inline bool intel_panel_use_ssc(struct drm_i915_private *dev_priv) { - return dev_priv->lvds_use_ssc && i915_panel_use_ssc; + return dev_priv->lvds_use_ssc && i915_panel_use_ssc + && !(dev_priv->quirks & QUIRK_LVDS_SSC_DISABLE); } static int i9xx_crtc_mode_set(struct drm_crtc *crtc, @@ -7810,6 +7811,15 @@ static void quirk_pipea_force (struct drm_device *dev) DRM_DEBUG_DRIVER("applying pipe a force quirk\n"); } +/* + * Some machines (Lenovo U160) do not work with SSC on LVDS for some reason + */ +static void quirk_ssc_force_disable(struct drm_device *dev) +{ + struct drm_i915_private *dev_priv = dev->dev_private; + dev_priv->quirks |= QUIRK_LVDS_SSC_DISABLE; +} + struct intel_quirk { int device; int subsystem_vendor; @@ -7838,6 +7848,9 @@ struct intel_quirk intel_quirks[] = { /* 855 & before need to leave pipe A & dpll A up */ { 0x3582, PCI_ANY_ID, PCI_ANY_ID, quirk_pipea_force }, { 0x2562, PCI_ANY_ID, PCI_ANY_ID, quirk_pipea_force }, + + /* 
Lenovo U160 cannot use SSC on LVDS */ + { 0x0046, 0x17aa, 0x3920, quirk_ssc_force_disable }, }; static void intel_init_quirks(struct drm_device *dev) diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index efa202499e37..2535933c49f8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -117,7 +117,7 @@ static __init int map_switcher(void) /* * Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from _switcher.S). + * Copy in the compiled-in Switcher code (from x86/switcher_32.S). */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index daaf86631647..28433a155d67 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -375,11 +375,9 @@ static bool direct_trap(unsigned int num) /* * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and - * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. + * device not available (TS handling) and of course, the hypercall trap. */ - return num != 14 && num != 13 && num != 7 && - num != 6 && num != LGUEST_TRAP_ENTRY; + return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; } /*:*/ @@ -429,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu) /* * Direct traps also mean that we need to know whenever the Guest wants to use - * a different kernel stack, so we can change the IDT entries to use that - * stack. The IDT entries expect a virtual address, so unlike most addresses + * a different kernel stack, so we can change the guest TSS to use that + * stack. The TSS entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 9136411fadd5..295df06e6590 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -59,6 +59,8 @@ struct lg_cpu { struct lguest_pages *last_pages; + /* Initialization mode: linear map everything. */ + bool linear_pages; int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 69c84a1d88ea..5289ffa2e500 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -108,6 +108,17 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } +/* + * To notify on reset or feature finalization, we (ab)use the NOTIFY + * hypercall, with the descriptor address of the device. + */ +static void status_notify(struct virtio_device *vdev) +{ + unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; + + hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); +} + /* * The virtio core takes the features the Host offers, and copies the ones * supported by the driver into the vdev->features array. Once that's all @@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev) if (test_bit(i, vdev->features)) out_features[i / 8] |= (1 << (i % 8)); } + + /* Tell Host we've finished with this device's feature negotiation */ + status_notify(vdev); } /* Once they've found a field, getting a copy of it is easy. 
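 * (The Launcher laid out the config space alongside each device
 * descriptor when it created the device, so the bytes are simply
 * copied straight out of, or into, that page here.)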
*/ @@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev) return to_lgdev(vdev)->desc->status; } -/* - * To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". - */ -static void set_status(struct virtio_device *vdev, u8 status) -{ - unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; - - /* We set the status. */ - to_lgdev(vdev)->desc->status = status; - hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); -} - static void lg_set_status(struct virtio_device *vdev, u8 status) { BUG_ON(!status); - set_status(vdev, status); + to_lgdev(vdev)->desc->status = status; + + /* Tell Host immediately if we failed. */ + if (status & VIRTIO_CONFIG_S_FAILED) + status_notify(vdev); } static void lg_reset(struct virtio_device *vdev) { - set_status(vdev, 0); + /* 0 status means "reset" */ + to_lgdev(vdev)->desc->status = 0; + status_notify(vdev); } /* diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 948c547b8e9e..f97e625241ad 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher - * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout and entry point. A read will run the - * Guest until something happens, such as a signal or the Guest doing a NOTIFY - * out to the Launcher. +/*P:200 This contains all the /dev/lguest code, whereby the userspace + * launcher controls and communicates with the Guest. For example, + * the first write will tell us the Guest's memory layout and entry + * point. A read will run the Guest until something happens, such as + * a signal or the Guest doing a NOTIFY out to the Launcher. There is + * also a way for the Launcher to attach eventfds to particular NOTIFY + * values instead of returning from the read() call. :*/ #include #include @@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input) goto free_eventfds; /* - * Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. + * Initialize the Guest's shadow page tables. This allocates + * memory, so can fail. */ err = init_guest_pagetable(lg); if (err) @@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = { .read = read, .llseek = default_llseek, }; +/*:*/ /* * This is a textbook example of a "misc" character device. Populate a "struct diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index d21578ee95de..3b62be160a6e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "lg.h" /*M:008 @@ -156,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These functions are just like the above two, except they access the Guest + * These functions are just like the above, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -196,7 +195,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 +/*M:007 * get_pfn is slow: we could probably try to grab batches of pages here as * an optimization (ie. pre-faulting). 
:*/ @@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) #endif /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - return false; + if (unlikely(cpu->linear_pages)) { + /* Faking up a linear mapping. */ + gpgd = __pgd(CHECK_GPGD_MASK); + } else { + gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + /* Toplevel not present? We can't map it in. */ + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + return false; + } /* Now look at the matching shadow entry. */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); @@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) } #ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* Middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; + if (unlikely(cpu->linear_pages)) { + /* Faking up a linear mapping. */ + gpmd = __pmd(_PAGE_TABLE); + } else { + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* Middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + } /* Now look at the matching shadow entry. */ spmd = spmd_addr(cpu, *spgd, vaddr); @@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif - /* Read the actual PTE value. */ - gpte = lgread(cpu, gpte_ptr, pte_t); + if (unlikely(cpu->linear_pages)) { + /* Linear? Make up a PTE which points to same page. */ + gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); + } else { + /* Read the actual PTE value. */ + gpte = lgread(cpu, gpte_ptr, pte_t); + } /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ - lgwrite(cpu, gpte_ptr, pte_t, gpte); + if (likely(!cpu->linear_pages)) + lgwrite(cpu, gpte_ptr, pte_t, gpte); /* * The fault is fixed, the page table is populated, the mapping @@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) #ifdef CONFIG_X86_PAE pmd_t gpmd; #endif + + /* Still not set up? Just map 1:1. */ + if (unlikely(cpu->linear_pages)) + return vaddr; + /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 - * (iv) Switching page tables - * - * Now we've seen all the page table setting and manipulation, let's see - * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. - */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) -{ - int newpgdir, repin = 0; - - /* Look to see if we have this one already. */ - newpgdir = find_pgdir(cpu->lg, pgtable); - /* - * If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. - */ - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) - newpgdir = new_pgdir(cpu, pgtable, &repin); - /* Change the current pgd index to the new one. 
*/ - cpu->cpu_pgd = newpgdir; - /* If it was completely blank, we map in the Guest kernel stack */ - if (repin) - pin_stack_pages(cpu); -} - /*H:470 * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used @@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) /* We need the Guest kernel stack mapped again. */ pin_stack_pages(cpu); } + +/*H:430 + * (iv) Switching page tables + * + * Now we've seen all the page table setting and manipulation, let's see + * what happens when the Guest changes page tables (ie. changes the top-level + * pgdir). This occurs on almost every context switch. + */ +void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +{ + int newpgdir, repin = 0; + + /* + * The very first time they call this, we're actually running without + * any page tables; we've been making it up. Throw them away now. + */ + if (unlikely(cpu->linear_pages)) { + release_all_pagetables(cpu->lg); + cpu->linear_pages = false; + /* Force allocation of a new pgdir. */ + newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); + } else { + /* Look to see if we have this one already. */ + newpgdir = find_pgdir(cpu->lg, pgtable); + } + + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ + if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) + newpgdir = new_pgdir(cpu, pgtable, &repin); + /* Change the current pgd index to the new one. */ + cpu->cpu_pgd = newpgdir; + /* If it was completely blank, we map in the Guest kernel stack */ + if (repin) + pin_stack_pages(cpu); +} /*:*/ /*M:009 @@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) } #endif -/*H:505 - * To get through boot, we construct simple identity page mappings (which - * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own. The linear mapping means we - * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, - * as you'll see. - * - * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). - */ -static unsigned long setup_pagetables(struct lguest *lg, - unsigned long mem, - unsigned long initrd_size) -{ - pgd_t __user *pgdir; - pte_t __user *linear; - unsigned long mem_base = (unsigned long)lg->mem_base; - unsigned int mapped_pages, i, linear_pages; -#ifdef CONFIG_X86_PAE - pmd_t __user *pmds; - unsigned int j; - pgd_t pgd; - pmd_t pmd; -#else - unsigned int phys_linear; -#endif - - /* - * We have mapped_pages frames to map, so we need linear_pages page - * tables to map them. - */ - mapped_pages = mem / PAGE_SIZE; - linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; - - /* We put the toplevel page directory page at the top of memory. */ - pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE); - - /* Now we use the next linear_pages pages as pte pages */ - linear = (void *)pgdir - linear_pages * PAGE_SIZE; - -#ifdef CONFIG_X86_PAE - /* - * And the single mid page goes below that. We only use one, but - * that's enough to map 1G, which definitely gets us through boot. - */ - pmds = (void *)linear - PAGE_SIZE; -#endif - /* - * Linear mapping is easy: put every page's address into the - * mapping in order. 
- */
- for (i = 0; i < mapped_pages; i++) {
- pte_t pte;
- pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
- if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
- return -EFAULT;
- }
-
-#ifdef CONFIG_X86_PAE
- /*
- * Make the Guest PMD entries point to the corresponding place in the
- * linear mapping (up to one page worth of PMD).
- */
- for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
- i += PTRS_PER_PTE, j++) {
- pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
- if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
- return -EFAULT;
- }
-
- /* One PGD entry, pointing to that PMD page. */
- pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
- /* Copy it in as the first PGD entry (ie. addresses 0-1G). */
- if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
- return -EFAULT;
- /*
- * And the other PGD entry to make the linear mapping at PAGE_OFFSET
- */
- if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
- return -EFAULT;
-#else
- /*
- * The top level points to the linear page table pages above.
- * We setup the identity and linear mappings here.
- */
- phys_linear = (unsigned long)linear - mem_base;
- for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
- pgd_t pgd;
- /*
- * Create a PGD entry which points to the right part of the
- * linear PTE pages.
- */
- pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
- /*
- * Copy it into the PGD page at 0 and PAGE_OFFSET.
- */
- if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
- || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
- + i / PTRS_PER_PTE],
- &pgd, sizeof(pgd)))
- return -EFAULT;
- }
-#endif
-
- /*
- * We return the top level (guest-physical) address: we remember where
- * this is to write it into lguest_data when the Guest initializes.
- */
- return (unsigned long)pgdir - mem_base;
-}
-
 /*H:500
 * (vii) Setting up the page tables initially.
 *
- * When a Guest is first created, the Launcher tells us where the toplevel of
- * its first page table is. We set some things up here:
+ * When a Guest is first created, we initialize a shadow page table which
+ * we will populate on future faults. The Guest doesn't have any actual
+ * pagetables yet, so we set linear_pages to tell demand_page() to fake it
+ * for the moment.
 */
 int init_guest_pagetable(struct lguest *lg)
 {
- u64 mem;
- u32 initrd_size;
- struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-#ifdef CONFIG_X86_PAE
- pgd_t *pgd;
- pmd_t *pmd_table;
-#endif
- /*
- * Get the Guest memory size and the ramdisk size from the boot header
- * located at lg->mem_base (Guest address 0).
- */
- if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
- || get_user(initrd_size, &boot->hdr.ramdisk_size))
- return -EFAULT;
+ struct lg_cpu *cpu = &lg->cpus[0];
+ int allocated = 0;
- /*
- * We start on the first shadow page table, and give it a blank PGD
- * page.
- */
- lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
- if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
- return lg->pgdirs[0].gpgdir;
- lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
- if (!lg->pgdirs[0].pgdir)
+ /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
+ cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
+ if (!allocated)
 return -ENOMEM;
-#ifdef CONFIG_X86_PAE
- /* For PAE, we also create the initial mid-level. */
- pgd = lg->pgdirs[0].pgdir;
- pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
- if (!pmd_table)
- return -ENOMEM;
-
- set_pgd(pgd + SWITCHER_PGD_INDEX,
- __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-#endif
-
- /* This is the current page table. */
- lg->cpus[0].cpu_pgd = 0;
+ /* We start with a linear mapping until the Guest initializes. */
+ cpu->linear_pages = true;
 return 0;
 }
@@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 * of virtual addresses used by the Switcher.
 */
 || put_user(RESERVE_MEM * 1024 * 1024,
- &cpu->lg->lguest_data->reserve_mem)
- || put_user(cpu->lg->pgdirs[0].gpgdir,
- &cpu->lg->lguest_data->pgdir))
+ &cpu->lg->lguest_data->reserve_mem)) {
 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
+ return;
+ }
 /*
 * In flush_user_mappings() we loop from 0 to
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 9f1659c3d1f3..65af42f2d593 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -269,10 +269,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 static int emulate_insn(struct lg_cpu *cpu)
 {
 u8 insn;
- unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned int insnlen = 0, in = 0, small_operand = 0;
 /*
 * The eip contains the *virtual* address of the Guest's instruction:
- * guest_pa just subtracts the Guest's page_offset.
+ * walk the Guest's page tables to find the "physical" address.
 */
 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
@@ -300,11 +300,10 @@ static int emulate_insn(struct lg_cpu *cpu)
 }
 /*
- * 0x66 is an "operand prefix". It means it's using the upper 16 bits
- * of the eax register.
+ * 0x66 is an "operand prefix". It means a 16-bit, not 32-bit, in/out.
 */
 if (insn == 0x66) {
- shift = 16;
+ small_operand = 1;
 /* The instruction is 1 byte so far, read the next byte. */
 insnlen = 1;
 insn = lgread(cpu, physaddr + insnlen, u8);
@@ -340,11 +339,14 @@ static int emulate_insn(struct lg_cpu *cpu)
 * traditionally means "there's nothing there".
 */
 if (in) {
- /* Lower bit tells is whether it's a 16 or 32 bit access */
- if (insn & 0x1)
- cpu->regs->eax = 0xFFFFFFFF;
- else
- cpu->regs->eax |= (0xFFFF << shift);
+ /* Lower bit means it's a 32/16 bit access */
+ if (insn & 0x1) {
+ if (small_operand)
+ cpu->regs->eax |= 0xFFFF;
+ else
+ cpu->regs->eax = 0xFFFFFFFF;
+ } else
+ cpu->regs->eax |= 0xFF;
 }
 /* Finally, we've "done" the instruction, so move past it. */
 cpu->regs->eip += insnlen;
@@ -352,69 +354,6 @@
 return 1;
 }
-/*
- * Our hypercalls mechanism used to be based on direct software interrupts.
- * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
- * change over to using kvm hypercalls.
- *
- * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid
- * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be
- * an *emulation approach*: if the fault was really produced by an hypercall
- * (is_hypercall() does exactly this check), we can just call the corresponding
- * hypercall host implementation function.
- *
- * But these invalid opcode faults are notably slower than software interrupts.
- * So we implemented the *patching (or rewriting) approach*: every time we hit
- * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f"
- * opcode, so next time the Guest calls this hypercall it will use the
- * faster trap mechanism.
- *
- * Matias even benchmarked it to convince you: this shows the average cycle
- * cost of a hypercall.
For each alternative solution mentioned above we've - * made 5 runs of the benchmark: - * - * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898 - * 2) emulation technique: 3410, 3681, 3466, 3392, 3780 - * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884 - * - * One two-line function is worth a 20% hypercall speed boost! - */ -static void rewrite_hypercall(struct lg_cpu *cpu) -{ - /* - * This are the opcodes we use to patch the Guest. The opcode for "int - * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). - */ - u8 insn[3] = {0xcd, 0x1f, 0x90}; - - __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* - * The above write might have caused a copy of that page to be made - * (if it was read-only). We need to make sure the Guest has - * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. - */ - guest_pagetable_clear_all(cpu); -} - -static bool is_hypercall(struct lg_cpu *cpu) -{ - u8 insn[3]; - - /* - * This must be the Guest kernel trying to do something. - * The bottom two bits of the CS segment register are the privilege - * level. - */ - if ((cpu->regs->cs & 3) != GUEST_PL) - return false; - - /* Is it a vmcall? */ - __lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn)); - return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1; -} - /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ void lguest_arch_handle_trap(struct lg_cpu *cpu) { @@ -429,20 +368,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) if (emulate_insn(cpu)) return; } - /* - * If KVM is active, the vmcall instruction triggers a General - * Protection Fault. Normally it triggers an invalid opcode - * fault (6): - */ - case 6: - /* - * We need to check if ring == GUEST_PL and faulting - * instruction == vmcall. - */ - if (is_hypercall(cpu)) { - rewrite_hypercall(cpu); - return; - } break; case 14: /* We've intercepted a Page Fault. */ /* @@ -486,7 +411,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again + * return to run the Guest again. */ cond_resched(); return; @@ -536,7 +461,7 @@ void __init lguest_arch_host_init(void) int i; /* - * Most of the i386/switcher.S doesn't care that it's been moved; on + * Most of the x86/switcher_32.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -664,7 +589,7 @@ void __init lguest_arch_host_init(void) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); } put_online_cpus(); -}; +} /*:*/ void __exit lguest_arch_host_fini(void) @@ -747,8 +672,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) /*:*/ /*L:030 - * lguest_arch_setup_regs() - * * Most of the Guest's registers are left alone: we used get_zeroed_page() to * allocate the structure, so they will be 0. 
*/ diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index f85e42224559..1ff5486213fb 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -106,6 +106,16 @@ struct mmc_blk_data { static DEFINE_MUTEX(open_lock); +enum mmc_blk_status { + MMC_BLK_SUCCESS = 0, + MMC_BLK_PARTIAL, + MMC_BLK_RETRY, + MMC_BLK_RETRY_SINGLE, + MMC_BLK_DATA_ERR, + MMC_BLK_CMD_ERR, + MMC_BLK_ABORT, +}; + module_param(perdev_minors, int, 0444); MODULE_PARM_DESC(perdev_minors, "Minors numbers to allocate per device"); @@ -427,14 +437,6 @@ static const struct block_device_operations mmc_bdops = { #endif }; -struct mmc_blk_request { - struct mmc_request mrq; - struct mmc_command sbc; - struct mmc_command cmd; - struct mmc_command stop; - struct mmc_data data; -}; - static inline int mmc_blk_part_switch(struct mmc_card *card, struct mmc_blk_data *md) { @@ -525,7 +527,20 @@ static u32 mmc_sd_num_wr_blocks(struct mmc_card *card) return result; } -static u32 get_card_status(struct mmc_card *card, struct request *req) +static int send_stop(struct mmc_card *card, u32 *status) +{ + struct mmc_command cmd = {0}; + int err; + + cmd.opcode = MMC_STOP_TRANSMISSION; + cmd.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC; + err = mmc_wait_for_cmd(card->host, &cmd, 5); + if (err == 0) + *status = cmd.resp[0]; + return err; +} + +static int get_card_status(struct mmc_card *card, u32 *status, int retries) { struct mmc_command cmd = {0}; int err; @@ -534,11 +549,141 @@ static u32 get_card_status(struct mmc_card *card, struct request *req) if (!mmc_host_is_spi(card->host)) cmd.arg = card->rca << 16; cmd.flags = MMC_RSP_SPI_R2 | MMC_RSP_R1 | MMC_CMD_AC; - err = mmc_wait_for_cmd(card->host, &cmd, 0); + err = mmc_wait_for_cmd(card->host, &cmd, retries); + if (err == 0) + *status = cmd.resp[0]; + return err; +} + +#define ERR_RETRY 2 +#define ERR_ABORT 1 +#define ERR_CONTINUE 0 + +static int mmc_blk_cmd_error(struct request *req, const char *name, int error, + bool status_valid, u32 status) +{ + switch (error) { + case -EILSEQ: + /* response crc error, retry the r/w cmd */ + pr_err("%s: %s sending %s command, card status %#x\n", + req->rq_disk->disk_name, "response CRC error", + name, status); + return ERR_RETRY; + + case -ETIMEDOUT: + pr_err("%s: %s sending %s command, card status %#x\n", + req->rq_disk->disk_name, "timed out", name, status); + + /* If the status cmd initially failed, retry the r/w cmd */ + if (!status_valid) + return ERR_RETRY; + + /* + * If it was a r/w cmd crc error, or illegal command + * (eg, issued in wrong state) then retry - we should + * have corrected the state problem above. + */ + if (status & (R1_COM_CRC_ERROR | R1_ILLEGAL_COMMAND)) + return ERR_RETRY; + + /* Otherwise abort the command */ + return ERR_ABORT; + + default: + /* We don't understand the error code the driver gave us */ + pr_err("%s: unknown error %d sending read/write command, card status %#x\n", + req->rq_disk->disk_name, error, status); + return ERR_ABORT; + } +} + +/* + * Initial r/w and stop cmd error recovery. + * We don't know whether the card received the r/w cmd or not, so try to + * restore things back to a sane state. Essentially, we do this as follows: + * - Obtain card status. If the first attempt to obtain card status fails, + * the status word will reflect the failed status cmd, not the failed + * r/w cmd. If we fail to obtain card status, it suggests we can no + * longer communicate with the card. + * - Check the card state. 
If the card received the cmd but there was a
+ * transient problem with the response, it might still be in a data transfer
+ * mode. Try to send it a stop command. If this fails, we can't recover.
+ * - If the r/w cmd failed due to a response CRC error, it was probably
+ * transient, so retry the cmd.
+ * - If the r/w cmd timed out, but we didn't get the r/w cmd status, retry.
+ * - If the r/w cmd timed out, and the r/w cmd failed due to CRC error or
+ * illegal cmd, retry.
+ * Otherwise we don't understand what happened, so abort.
+ */
+static int mmc_blk_cmd_recovery(struct mmc_card *card, struct request *req,
+ struct mmc_blk_request *brq)
+{
+ bool prev_cmd_status_valid = true;
+ u32 status, stop_status = 0;
+ int err, retry;
+
+ /*
+ * Try to get card status which indicates both the card state
+ * and why there was no response. If the first attempt fails,
+ * we can't be sure the returned status is for the r/w command.
+ */
+ for (retry = 2; retry >= 0; retry--) {
+ err = get_card_status(card, &status, 0);
+ if (!err)
+ break;
+
+ prev_cmd_status_valid = false;
+ pr_err("%s: error %d sending status command, %sing\n",
+ req->rq_disk->disk_name, err, retry ? "retry" : "abort");
+ }
+
+ /* We couldn't get a response from the card. Give up. */
 if (err)
- printk(KERN_ERR "%s: error %d sending status command",
- req->rq_disk->disk_name, err);
- return cmd.resp[0];
+ return ERR_ABORT;
+
+ /*
+ * Check the current card state. If it is in some data transfer
+ * mode, tell it to stop (and hopefully transition back to TRAN.)
+ */
+ if (R1_CURRENT_STATE(status) == R1_STATE_DATA ||
+ R1_CURRENT_STATE(status) == R1_STATE_RCV) {
+ err = send_stop(card, &stop_status);
+ if (err)
+ pr_err("%s: error %d sending stop command\n",
+ req->rq_disk->disk_name, err);
+
+ /*
+ * If the stop cmd also timed out, the card is probably
+ * not present, so abort. Other errors are bad news too.
+ */
+ if (err)
+ return ERR_ABORT;
+ }
+
+ /* Check for set block count errors */
+ if (brq->sbc.error)
+ return mmc_blk_cmd_error(req, "SET_BLOCK_COUNT", brq->sbc.error,
+ prev_cmd_status_valid, status);
+
+ /* Check for r/w command errors */
+ if (brq->cmd.error)
+ return mmc_blk_cmd_error(req, "r/w cmd", brq->cmd.error,
+ prev_cmd_status_valid, status);
+
+ /* Now for stop errors. These aren't fatal to the transfer. */
+ pr_err("%s: error %d sending stop command, original cmd response %#x, card status %#x\n",
+ req->rq_disk->disk_name, brq->stop.error,
+ brq->cmd.resp[0], status);
+
+ /*
+ * Substitute in our own stop status as this will give the error
+ * state which happened during the execution of the r/w command.
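+ * (send_stop() was issued while the card was still in a data transfer
+ * state, so its R1 response latched the error bits raised by the failed
+ * transfer; clearing brq->stop.error below lets the rest of the code
+ * treat this as a valid stop response.)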
+ */
+ if (stop_status) {
+ brq->stop.resp[0] = stop_status;
+ brq->stop.error = 0;
+ }
+ return ERR_CONTINUE;
 }
 
 static int mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req)
@@ -669,240 +814,324 @@ static inline void mmc_apply_rel_rw(struct mmc_blk_request *brq,
 }
 }
 
-static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
+#define CMD_ERRORS \
+ (R1_OUT_OF_RANGE | /* Command argument out of range */ \
+ R1_ADDRESS_ERROR | /* Misaligned address */ \
+ R1_BLOCK_LEN_ERROR | /* Transferred block length incorrect */\
+ R1_WP_VIOLATION | /* Tried to write to protected block */ \
+ R1_CC_ERROR | /* Card controller error */ \
+ R1_ERROR) /* General/unknown error */
+
+static int mmc_blk_err_check(struct mmc_card *card,
+ struct mmc_async_req *areq)
 {
- struct mmc_blk_data *md = mq->data;
- struct mmc_card *card = md->queue.card;
- struct mmc_blk_request brq;
- int ret = 1, disable_multi = 0;
+ enum mmc_blk_status ret = MMC_BLK_SUCCESS;
+ struct mmc_queue_req *mq_mrq = container_of(areq, struct mmc_queue_req,
+ mmc_active);
+ struct mmc_blk_request *brq = &mq_mrq->brq;
+ struct request *req = mq_mrq->req;
 
 /*
- * Reliable writes are used to implement Forced Unit Access and
- * REQ_META accesses, and are supported only on MMCs.
+ * sbc.error indicates a problem with the set block count
+ * command. No data will have been transferred.
+ *
+ * cmd.error indicates a problem with the r/w command. No
+ * data will have been transferred.
+ *
+ * stop.error indicates a problem with the stop command. Data
+ * may have been transferred, or may still be transferring.
 */
- bool do_rel_wr = ((req->cmd_flags & REQ_FUA) ||
- (req->cmd_flags & REQ_META)) &&
- (rq_data_dir(req) == WRITE) &&
- (md->flags & MMC_BLK_REL_WR);
+ if (brq->sbc.error || brq->cmd.error || brq->stop.error) {
+ switch (mmc_blk_cmd_recovery(card, req, brq)) {
+ case ERR_RETRY:
+ return MMC_BLK_RETRY;
+ case ERR_ABORT:
+ return MMC_BLK_ABORT;
+ case ERR_CONTINUE:
+ break;
+ }
+ }
 
- do {
- struct mmc_command cmd = {0};
- u32 readcmd, writecmd, status = 0;
-
- memset(&brq, 0, sizeof(struct mmc_blk_request));
- brq.mrq.cmd = &brq.cmd;
- brq.mrq.data = &brq.data;
-
- brq.cmd.arg = blk_rq_pos(req);
- if (!mmc_card_blockaddr(card))
- brq.cmd.arg <<= 9;
- brq.cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC;
- brq.data.blksz = 512;
- brq.stop.opcode = MMC_STOP_TRANSMISSION;
- brq.stop.arg = 0;
- brq.stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
- brq.data.blocks = blk_rq_sectors(req);
+ /*
+ * Check for errors relating to the execution of the
+ * initial command - such as address errors. No data
+ * has been transferred.
+ */
+ if (brq->cmd.resp[0] & CMD_ERRORS) {
+ pr_err("%s: r/w command failed, status = %#x\n",
+ req->rq_disk->disk_name, brq->cmd.resp[0]);
+ return MMC_BLK_ABORT;
+ }
 
- /*
- * The block layer doesn't support all sector count
- * restrictions, so we need to be prepared for too big
- * requests.
- */
- if (brq.data.blocks > card->host->max_blk_count)
- brq.data.blocks = card->host->max_blk_count;
+ /*
+ * Everything else is either success, or a data error of some
+ * kind. If it was a write, we may have transitioned to
+ * program mode, and we must wait for the card to finish
+ * programming.
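+ * Hence the polling loop below: we keep issuing CMD13 (SEND_STATUS)
+ * until the card reports READY_FOR_DATA and has left the PRG state.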
+ */ + if (!mmc_host_is_spi(card->host) && rq_data_dir(req) != READ) { + u32 status; + do { + int err = get_card_status(card, &status, 5); + if (err) { + printk(KERN_ERR "%s: error %d requesting status\n", + req->rq_disk->disk_name, err); + return MMC_BLK_CMD_ERR; + } + /* + * Some cards mishandle the status bits, + * so make sure to check both the busy + * indication and the card state. + */ + } while (!(status & R1_READY_FOR_DATA) || + (R1_CURRENT_STATE(status) == R1_STATE_PRG)); + } - /* - * After a read error, we redo the request one sector at a time - * in order to accurately determine which sectors can be read - * successfully. - */ - if (disable_multi && brq.data.blocks > 1) - brq.data.blocks = 1; + if (brq->data.error) { + pr_err("%s: error %d transferring data, sector %u, nr %u, cmd response %#x, card status %#x\n", + req->rq_disk->disk_name, brq->data.error, + (unsigned)blk_rq_pos(req), + (unsigned)blk_rq_sectors(req), + brq->cmd.resp[0], brq->stop.resp[0]); - if (brq.data.blocks > 1 || do_rel_wr) { - /* SPI multiblock writes terminate using a special - * token, not a STOP_TRANSMISSION request. - */ - if (!mmc_host_is_spi(card->host) || - rq_data_dir(req) == READ) - brq.mrq.stop = &brq.stop; - readcmd = MMC_READ_MULTIPLE_BLOCK; - writecmd = MMC_WRITE_MULTIPLE_BLOCK; - } else { - brq.mrq.stop = NULL; - readcmd = MMC_READ_SINGLE_BLOCK; - writecmd = MMC_WRITE_BLOCK; - } if (rq_data_dir(req) == READ) { - brq.cmd.opcode = readcmd; - brq.data.flags |= MMC_DATA_READ; + if (brq->data.blocks > 1) { + /* Redo read one sector at a time */ + pr_warning("%s: retrying using single block read\n", + req->rq_disk->disk_name); + return MMC_BLK_RETRY_SINGLE; + } + return MMC_BLK_DATA_ERR; } else { - brq.cmd.opcode = writecmd; - brq.data.flags |= MMC_DATA_WRITE; + return MMC_BLK_CMD_ERR; } + } - if (do_rel_wr) - mmc_apply_rel_rw(&brq, card, req); + if (ret == MMC_BLK_SUCCESS && + blk_rq_bytes(req) != brq->data.bytes_xfered) + ret = MMC_BLK_PARTIAL; - /* - * Pre-defined multi-block transfers are preferable to - * open ended-ones (and necessary for reliable writes). - * However, it is not sufficient to just send CMD23, - * and avoid the final CMD12, as on an error condition - * CMD12 (stop) needs to be sent anyway. This, coupled - * with Auto-CMD23 enhancements provided by some - * hosts, means that the complexity of dealing - * with this is best left to the host. If CMD23 is - * supported by card and host, we'll fill sbc in and let - * the host deal with handling it correctly. This means - * that for hosts that don't expose MMC_CAP_CMD23, no - * change of behavior will be observed. - * - * N.B: Some MMC cards experience perf degradation. - * We'll avoid using CMD23-bounded multiblock writes for - * these, while retaining features like reliable writes. - */ + return ret; +} - if ((md->flags & MMC_BLK_CMD23) && - mmc_op_multi(brq.cmd.opcode) && - (do_rel_wr || !(card->quirks & MMC_QUIRK_BLK_NO_CMD23))) { - brq.sbc.opcode = MMC_SET_BLOCK_COUNT; - brq.sbc.arg = brq.data.blocks | - (do_rel_wr ? 
(1 << 31) : 0); - brq.sbc.flags = MMC_RSP_R1 | MMC_CMD_AC; - brq.mrq.sbc = &brq.sbc; - } +static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, + struct mmc_card *card, + int disable_multi, + struct mmc_queue *mq) +{ + u32 readcmd, writecmd; + struct mmc_blk_request *brq = &mqrq->brq; + struct request *req = mqrq->req; + struct mmc_blk_data *md = mq->data; - mmc_set_data_timeout(&brq.data, card); + /* + * Reliable writes are used to implement Forced Unit Access and + * REQ_META accesses, and are supported only on MMCs. + */ + bool do_rel_wr = ((req->cmd_flags & REQ_FUA) || + (req->cmd_flags & REQ_META)) && + (rq_data_dir(req) == WRITE) && + (md->flags & MMC_BLK_REL_WR); - brq.data.sg = mq->sg; - brq.data.sg_len = mmc_queue_map_sg(mq); + memset(brq, 0, sizeof(struct mmc_blk_request)); + brq->mrq.cmd = &brq->cmd; + brq->mrq.data = &brq->data; - /* - * Adjust the sg list so it is the same size as the - * request. + brq->cmd.arg = blk_rq_pos(req); + if (!mmc_card_blockaddr(card)) + brq->cmd.arg <<= 9; + brq->cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; + brq->data.blksz = 512; + brq->stop.opcode = MMC_STOP_TRANSMISSION; + brq->stop.arg = 0; + brq->stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC; + brq->data.blocks = blk_rq_sectors(req); + + /* + * The block layer doesn't support all sector count + * restrictions, so we need to be prepared for too big + * requests. + */ + if (brq->data.blocks > card->host->max_blk_count) + brq->data.blocks = card->host->max_blk_count; + + /* + * After a read error, we redo the request one sector at a time + * in order to accurately determine which sectors can be read + * successfully. + */ + if (disable_multi && brq->data.blocks > 1) + brq->data.blocks = 1; + + if (brq->data.blocks > 1 || do_rel_wr) { + /* SPI multiblock writes terminate using a special + * token, not a STOP_TRANSMISSION request. */ - if (brq.data.blocks != blk_rq_sectors(req)) { - int i, data_size = brq.data.blocks << 9; - struct scatterlist *sg; - - for_each_sg(brq.data.sg, sg, brq.data.sg_len, i) { - data_size -= sg->length; - if (data_size <= 0) { - sg->length += data_size; - i++; - break; - } - } - brq.data.sg_len = i; - } + if (!mmc_host_is_spi(card->host) || + rq_data_dir(req) == READ) + brq->mrq.stop = &brq->stop; + readcmd = MMC_READ_MULTIPLE_BLOCK; + writecmd = MMC_WRITE_MULTIPLE_BLOCK; + } else { + brq->mrq.stop = NULL; + readcmd = MMC_READ_SINGLE_BLOCK; + writecmd = MMC_WRITE_BLOCK; + } + if (rq_data_dir(req) == READ) { + brq->cmd.opcode = readcmd; + brq->data.flags |= MMC_DATA_READ; + } else { + brq->cmd.opcode = writecmd; + brq->data.flags |= MMC_DATA_WRITE; + } - mmc_queue_bounce_pre(mq); + if (do_rel_wr) + mmc_apply_rel_rw(brq, card, req); - mmc_wait_for_req(card->host, &brq.mrq); + /* + * Pre-defined multi-block transfers are preferable to + * open ended-ones (and necessary for reliable writes). + * However, it is not sufficient to just send CMD23, + * and avoid the final CMD12, as on an error condition + * CMD12 (stop) needs to be sent anyway. This, coupled + * with Auto-CMD23 enhancements provided by some + * hosts, means that the complexity of dealing + * with this is best left to the host. If CMD23 is + * supported by card and host, we'll fill sbc in and let + * the host deal with handling it correctly. This means + * that for hosts that don't expose MMC_CAP_CMD23, no + * change of behavior will be observed. + * + * N.B: Some MMC cards experience perf degradation. 
+ * We'll avoid using CMD23-bounded multiblock writes for + * these, while retaining features like reliable writes. + */ - mmc_queue_bounce_post(mq); + if ((md->flags & MMC_BLK_CMD23) && + mmc_op_multi(brq->cmd.opcode) && + (do_rel_wr || !(card->quirks & MMC_QUIRK_BLK_NO_CMD23))) { + brq->sbc.opcode = MMC_SET_BLOCK_COUNT; + brq->sbc.arg = brq->data.blocks | + (do_rel_wr ? (1 << 31) : 0); + brq->sbc.flags = MMC_RSP_R1 | MMC_CMD_AC; + brq->mrq.sbc = &brq->sbc; + } - /* - * Check for errors here, but don't jump to cmd_err - * until later as we need to wait for the card to leave - * programming mode even when things go wrong. - */ - if (brq.sbc.error || brq.cmd.error || - brq.data.error || brq.stop.error) { - if (brq.data.blocks > 1 && rq_data_dir(req) == READ) { - /* Redo read one sector at a time */ - printk(KERN_WARNING "%s: retrying using single " - "block read\n", req->rq_disk->disk_name); - disable_multi = 1; - continue; - } - status = get_card_status(card, req); - } + mmc_set_data_timeout(&brq->data, card); - if (brq.sbc.error) { - printk(KERN_ERR "%s: error %d sending SET_BLOCK_COUNT " - "command, response %#x, card status %#x\n", - req->rq_disk->disk_name, brq.sbc.error, - brq.sbc.resp[0], status); - } + brq->data.sg = mqrq->sg; + brq->data.sg_len = mmc_queue_map_sg(mq, mqrq); - if (brq.cmd.error) { - printk(KERN_ERR "%s: error %d sending read/write " - "command, response %#x, card status %#x\n", - req->rq_disk->disk_name, brq.cmd.error, - brq.cmd.resp[0], status); + /* + * Adjust the sg list so it is the same size as the + * request. + */ + if (brq->data.blocks != blk_rq_sectors(req)) { + int i, data_size = brq->data.blocks << 9; + struct scatterlist *sg; + + for_each_sg(brq->data.sg, sg, brq->data.sg_len, i) { + data_size -= sg->length; + if (data_size <= 0) { + sg->length += data_size; + i++; + break; + } } + brq->data.sg_len = i; + } - if (brq.data.error) { - if (brq.data.error == -ETIMEDOUT && brq.mrq.stop) - /* 'Stop' response contains card status */ - status = brq.mrq.stop->resp[0]; - printk(KERN_ERR "%s: error %d transferring data," - " sector %u, nr %u, card status %#x\n", - req->rq_disk->disk_name, brq.data.error, - (unsigned)blk_rq_pos(req), - (unsigned)blk_rq_sectors(req), status); - } + mqrq->mmc_active.mrq = &brq->mrq; + mqrq->mmc_active.err_check = mmc_blk_err_check; - if (brq.stop.error) { - printk(KERN_ERR "%s: error %d sending stop command, " - "response %#x, card status %#x\n", - req->rq_disk->disk_name, brq.stop.error, - brq.stop.resp[0], status); - } + mmc_queue_bounce_pre(mqrq); +} - if (!mmc_host_is_spi(card->host) && rq_data_dir(req) != READ) { - do { - int err; - - cmd.opcode = MMC_SEND_STATUS; - cmd.arg = card->rca << 16; - cmd.flags = MMC_RSP_R1 | MMC_CMD_AC; - err = mmc_wait_for_cmd(card->host, &cmd, 5); - if (err) { - printk(KERN_ERR "%s: error %d requesting status\n", - req->rq_disk->disk_name, err); - goto cmd_err; - } - /* - * Some cards mishandle the status bits, - * so make sure to check both the busy - * indication and the card state. 
- */
- } while (!(cmd.resp[0] & R1_READY_FOR_DATA) ||
- (R1_CURRENT_STATE(cmd.resp[0]) == 7));
-
-#if 0
- if (cmd.resp[0] & ~0x00000900)
- printk(KERN_ERR "%s: status = %08x\n",
- req->rq_disk->disk_name, cmd.resp[0]);
- if (mmc_decode_status(cmd.resp))
- goto cmd_err;
-#endif
- }
+static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
+{
+ struct mmc_blk_data *md = mq->data;
+ struct mmc_card *card = md->queue.card;
+ struct mmc_blk_request *brq = &mq->mqrq_cur->brq;
+ int ret = 1, disable_multi = 0, retry = 0;
+ enum mmc_blk_status status;
+ struct mmc_queue_req *mq_rq;
+ struct request *req;
+ struct mmc_async_req *areq;
+
+ if (!rqc && !mq->mqrq_prev->req)
+ return 0;
 
- if (brq.cmd.error || brq.stop.error || brq.data.error) {
- if (rq_data_dir(req) == READ) {
+ do {
+ if (rqc) {
+ mmc_blk_rw_rq_prep(mq->mqrq_cur, card, 0, mq);
+ areq = &mq->mqrq_cur->mmc_active;
+ } else
+ areq = NULL;
+ areq = mmc_start_req(card->host, areq, (int *) &status);
+ if (!areq)
+ return 0;
+
+ mq_rq = container_of(areq, struct mmc_queue_req, mmc_active);
+ brq = &mq_rq->brq;
+ req = mq_rq->req;
+ mmc_queue_bounce_post(mq_rq);
+
+ switch (status) {
+ case MMC_BLK_SUCCESS:
+ case MMC_BLK_PARTIAL:
+ /*
+ * A block was successfully transferred.
+ */
+ spin_lock_irq(&md->lock);
+ ret = __blk_end_request(req, 0,
+ brq->data.bytes_xfered);
+ spin_unlock_irq(&md->lock);
+ if (status == MMC_BLK_SUCCESS && ret) {
 /*
- * After an error, we redo I/O one sector at a
- * time, so we only reach here after trying to
- * read a single sector.
+ * blk_end_request() has returned non-zero
+ * even though all data was transferred and
+ * the host reported no errors.
+ * If this happens it's a bug.
 */
- spin_lock_irq(&md->lock);
- ret = __blk_end_request(req, -EIO, brq.data.blksz);
- spin_unlock_irq(&md->lock);
- continue;
+ printk(KERN_ERR "%s BUG rq_tot %d d_xfer %d\n",
+ __func__, blk_rq_bytes(req),
+ brq->data.bytes_xfered);
+ rqc = NULL;
+ goto cmd_abort;
 }
+ break;
+ case MMC_BLK_CMD_ERR:
 goto cmd_err;
+ case MMC_BLK_RETRY_SINGLE:
+ disable_multi = 1;
+ break;
+ case MMC_BLK_RETRY:
+ if (retry++ < 5)
+ break;
+ case MMC_BLK_ABORT:
+ goto cmd_abort;
+ case MMC_BLK_DATA_ERR:
+ /*
+ * After an error, we redo I/O one sector at a
+ * time, so we only reach here after trying to
+ * read a single sector.
+ */
+ spin_lock_irq(&md->lock);
+ ret = __blk_end_request(req, -EIO,
+ brq->data.blksz);
+ spin_unlock_irq(&md->lock);
+ if (!ret)
+ goto start_new_req;
+ break;
 }
- /*
- * A block was successfully transferred.
- */
- spin_lock_irq(&md->lock);
- ret = __blk_end_request(req, 0, brq.data.bytes_xfered);
- spin_unlock_irq(&md->lock);
+ if (ret) {
+ /*
+ * In case of an incomplete request,
+ * prepare it again and resend.
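+ * (__blk_end_request() above has already retired the
+ * completed part, so req now describes only the remaining
+ * sectors and mmc_blk_rw_rq_prep() picks up from there.)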
+ */ + mmc_blk_rw_rq_prep(mq_rq, card, disable_multi, mq); + mmc_start_req(card->host, &mq_rq->mmc_active, NULL); + } } while (ret); return 1; @@ -927,15 +1156,22 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req) } } else { spin_lock_irq(&md->lock); - ret = __blk_end_request(req, 0, brq.data.bytes_xfered); + ret = __blk_end_request(req, 0, brq->data.bytes_xfered); spin_unlock_irq(&md->lock); } + cmd_abort: spin_lock_irq(&md->lock); while (ret) ret = __blk_end_request(req, -EIO, blk_rq_cur_bytes(req)); spin_unlock_irq(&md->lock); + start_new_req: + if (rqc) { + mmc_blk_rw_rq_prep(mq->mqrq_cur, card, 0, mq); + mmc_start_req(card->host, &mq->mqrq_cur->mmc_active, NULL); + } + return 0; } @@ -945,26 +1181,37 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) struct mmc_blk_data *md = mq->data; struct mmc_card *card = md->queue.card; - mmc_claim_host(card->host); + if (req && !mq->mqrq_prev->req) + /* claim host only for the first request */ + mmc_claim_host(card->host); + ret = mmc_blk_part_switch(card, md); if (ret) { ret = 0; goto out; } - if (req->cmd_flags & REQ_DISCARD) { + if (req && req->cmd_flags & REQ_DISCARD) { + /* complete ongoing async transfer before issuing discard */ + if (card->host->areq) + mmc_blk_issue_rw_rq(mq, NULL); if (req->cmd_flags & REQ_SECURE) ret = mmc_blk_issue_secdiscard_rq(mq, req); else ret = mmc_blk_issue_discard_rq(mq, req); - } else if (req->cmd_flags & REQ_FLUSH) { + } else if (req && req->cmd_flags & REQ_FLUSH) { + /* complete ongoing async transfer before issuing flush */ + if (card->host->areq) + mmc_blk_issue_rw_rq(mq, NULL); ret = mmc_blk_issue_flush(mq, req); } else { ret = mmc_blk_issue_rw_rq(mq, req); } out: - mmc_release_host(card->host); + if (!req) + /* release host only when there are no more requests */ + mmc_release_host(card->host); return ret; } diff --git a/drivers/mmc/card/mmc_test.c b/drivers/mmc/card/mmc_test.c index 233cdfae92f4..006a5e9f8ab8 100644 --- a/drivers/mmc/card/mmc_test.c +++ b/drivers/mmc/card/mmc_test.c @@ -148,6 +148,27 @@ struct mmc_test_card { struct mmc_test_general_result *gr; }; +enum mmc_test_prep_media { + MMC_TEST_PREP_NONE = 0, + MMC_TEST_PREP_WRITE_FULL = 1 << 0, + MMC_TEST_PREP_ERASE = 1 << 1, +}; + +struct mmc_test_multiple_rw { + unsigned int *sg_len; + unsigned int *bs; + unsigned int len; + unsigned int size; + bool do_write; + bool do_nonblock_req; + enum mmc_test_prep_media prepare; +}; + +struct mmc_test_async_req { + struct mmc_async_req areq; + struct mmc_test_card *test; +}; + /*******************************************************************/ /* General helper functions */ /*******************************************************************/ @@ -367,21 +388,26 @@ out_free: * Map memory into a scatterlist. Optionally allow the same memory to be * mapped more than once. 
*/ -static int mmc_test_map_sg(struct mmc_test_mem *mem, unsigned long sz, +static int mmc_test_map_sg(struct mmc_test_mem *mem, unsigned long size, struct scatterlist *sglist, int repeat, unsigned int max_segs, unsigned int max_seg_sz, - unsigned int *sg_len) + unsigned int *sg_len, int min_sg_len) { struct scatterlist *sg = NULL; unsigned int i; + unsigned long sz = size; sg_init_table(sglist, max_segs); + if (min_sg_len > max_segs) + min_sg_len = max_segs; *sg_len = 0; do { for (i = 0; i < mem->cnt; i++) { unsigned long len = PAGE_SIZE << mem->arr[i].order; + if (min_sg_len && (size / min_sg_len < len)) + len = ALIGN(size / min_sg_len, 512); if (len > sz) len = sz; if (len > max_seg_sz) @@ -554,11 +580,12 @@ static void mmc_test_print_avg_rate(struct mmc_test_card *test, uint64_t bytes, printk(KERN_INFO "%s: Transfer of %u x %u sectors (%u x %u%s KiB) took " "%lu.%09lu seconds (%u kB/s, %u KiB/s, " - "%u.%02u IOPS)\n", + "%u.%02u IOPS, sg_len %d)\n", mmc_hostname(test->card->host), count, sectors, count, sectors >> 1, (sectors & 1 ? ".5" : ""), (unsigned long)ts.tv_sec, (unsigned long)ts.tv_nsec, - rate / 1000, rate / 1024, iops / 100, iops % 100); + rate / 1000, rate / 1024, iops / 100, iops % 100, + test->area.sg_len); mmc_test_save_transfer_result(test, count, sectors, ts, rate, iops); } @@ -661,7 +688,7 @@ static void mmc_test_prepare_broken_mrq(struct mmc_test_card *test, * Checks that a normal transfer didn't have any errors */ static int mmc_test_check_result(struct mmc_test_card *test, - struct mmc_request *mrq) + struct mmc_request *mrq) { int ret; @@ -685,6 +712,17 @@ static int mmc_test_check_result(struct mmc_test_card *test, return ret; } +static int mmc_test_check_result_async(struct mmc_card *card, + struct mmc_async_req *areq) +{ + struct mmc_test_async_req *test_async = + container_of(areq, struct mmc_test_async_req, areq); + + mmc_test_wait_busy(test_async->test); + + return mmc_test_check_result(test_async->test, areq->mrq); +} + /* * Checks that a "short transfer" behaved as expected */ @@ -719,6 +757,85 @@ static int mmc_test_check_broken_result(struct mmc_test_card *test, return ret; } +/* + * Tests nonblock transfer with certain parameters + */ +static void mmc_test_nonblock_reset(struct mmc_request *mrq, + struct mmc_command *cmd, + struct mmc_command *stop, + struct mmc_data *data) +{ + memset(mrq, 0, sizeof(struct mmc_request)); + memset(cmd, 0, sizeof(struct mmc_command)); + memset(data, 0, sizeof(struct mmc_data)); + memset(stop, 0, sizeof(struct mmc_command)); + + mrq->cmd = cmd; + mrq->data = data; + mrq->stop = stop; +} +static int mmc_test_nonblock_transfer(struct mmc_test_card *test, + struct scatterlist *sg, unsigned sg_len, + unsigned dev_addr, unsigned blocks, + unsigned blksz, int write, int count) +{ + struct mmc_request mrq1; + struct mmc_command cmd1; + struct mmc_command stop1; + struct mmc_data data1; + + struct mmc_request mrq2; + struct mmc_command cmd2; + struct mmc_command stop2; + struct mmc_data data2; + + struct mmc_test_async_req test_areq[2]; + struct mmc_async_req *done_areq; + struct mmc_async_req *cur_areq = &test_areq[0].areq; + struct mmc_async_req *other_areq = &test_areq[1].areq; + int i; + int ret; + + test_areq[0].test = test; + test_areq[1].test = test; + + mmc_test_nonblock_reset(&mrq1, &cmd1, &stop1, &data1); + mmc_test_nonblock_reset(&mrq2, &cmd2, &stop2, &data2); + + cur_areq->mrq = &mrq1; + cur_areq->err_check = mmc_test_check_result_async; + other_areq->mrq = &mrq2; + other_areq->err_check = mmc_test_check_result_async; 
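+
+ /*
+ * Pipelined issue: mmc_start_req() returns the previously
+ * completed request (if any) while the freshly prepared one is
+ * left running, so cur_areq and other_areq are swapped each
+ * iteration to keep one transfer in flight at all times.
+ */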
+ + for (i = 0; i < count; i++) { + mmc_test_prepare_mrq(test, cur_areq->mrq, sg, sg_len, dev_addr, + blocks, blksz, write); + done_areq = mmc_start_req(test->card->host, cur_areq, &ret); + + if (ret || (!done_areq && i > 0)) + goto err; + + if (done_areq) { + if (done_areq->mrq == &mrq2) + mmc_test_nonblock_reset(&mrq2, &cmd2, + &stop2, &data2); + else + mmc_test_nonblock_reset(&mrq1, &cmd1, + &stop1, &data1); + } + done_areq = cur_areq; + cur_areq = other_areq; + other_areq = done_areq; + dev_addr += blocks; + } + + done_areq = mmc_start_req(test->card->host, NULL, &ret); + + return ret; +err: + return ret; +} + /* * Tests a basic transfer with certain parameters */ @@ -1302,7 +1419,7 @@ static int mmc_test_no_highmem(struct mmc_test_card *test) * Map sz bytes so that it can be transferred. */ static int mmc_test_area_map(struct mmc_test_card *test, unsigned long sz, - int max_scatter) + int max_scatter, int min_sg_len) { struct mmc_test_area *t = &test->area; int err; @@ -1315,7 +1432,7 @@ static int mmc_test_area_map(struct mmc_test_card *test, unsigned long sz, &t->sg_len); } else { err = mmc_test_map_sg(t->mem, sz, t->sg, 1, t->max_segs, - t->max_seg_sz, &t->sg_len); + t->max_seg_sz, &t->sg_len, min_sg_len); } if (err) printk(KERN_INFO "%s: Failed to map sg list\n", @@ -1336,14 +1453,17 @@ static int mmc_test_area_transfer(struct mmc_test_card *test, } /* - * Map and transfer bytes. + * Map and transfer bytes for multiple transfers. */ -static int mmc_test_area_io(struct mmc_test_card *test, unsigned long sz, - unsigned int dev_addr, int write, int max_scatter, - int timed) +static int mmc_test_area_io_seq(struct mmc_test_card *test, unsigned long sz, + unsigned int dev_addr, int write, + int max_scatter, int timed, int count, + bool nonblock, int min_sg_len) { struct timespec ts1, ts2; - int ret; + int ret = 0; + int i; + struct mmc_test_area *t = &test->area; /* * In the case of a maximally scattered transfer, the maximum transfer @@ -1361,14 +1481,21 @@ static int mmc_test_area_io(struct mmc_test_card *test, unsigned long sz, sz = max_tfr; } - ret = mmc_test_area_map(test, sz, max_scatter); + ret = mmc_test_area_map(test, sz, max_scatter, min_sg_len); if (ret) return ret; if (timed) getnstimeofday(&ts1); + if (nonblock) + ret = mmc_test_nonblock_transfer(test, t->sg, t->sg_len, + dev_addr, t->blocks, 512, write, count); + else + for (i = 0; i < count && ret == 0; i++) { + ret = mmc_test_area_transfer(test, dev_addr, write); + dev_addr += sz >> 9; + } - ret = mmc_test_area_transfer(test, dev_addr, write); if (ret) return ret; @@ -1376,11 +1503,19 @@ static int mmc_test_area_io(struct mmc_test_card *test, unsigned long sz, getnstimeofday(&ts2); if (timed) - mmc_test_print_rate(test, sz, &ts1, &ts2); + mmc_test_print_avg_rate(test, sz, count, &ts1, &ts2); return 0; } +static int mmc_test_area_io(struct mmc_test_card *test, unsigned long sz, + unsigned int dev_addr, int write, int max_scatter, + int timed) +{ + return mmc_test_area_io_seq(test, sz, dev_addr, write, max_scatter, + timed, 1, false, 0); +} + /* * Write the test area entirely. 
*/ @@ -1954,6 +2089,245 @@ static int mmc_test_large_seq_write_perf(struct mmc_test_card *test) return mmc_test_large_seq_perf(test, 1); } +static int mmc_test_rw_multiple(struct mmc_test_card *test, + struct mmc_test_multiple_rw *tdata, + unsigned int reqsize, unsigned int size, + int min_sg_len) +{ + unsigned int dev_addr; + struct mmc_test_area *t = &test->area; + int ret = 0; + + /* Set up test area */ + if (size > mmc_test_capacity(test->card) / 2 * 512) + size = mmc_test_capacity(test->card) / 2 * 512; + if (reqsize > t->max_tfr) + reqsize = t->max_tfr; + dev_addr = mmc_test_capacity(test->card) / 4; + if ((dev_addr & 0xffff0000)) + dev_addr &= 0xffff0000; /* Round to 64MiB boundary */ + else + dev_addr &= 0xfffff800; /* Round to 1MiB boundary */ + if (!dev_addr) + goto err; + + if (reqsize > size) + return 0; + + /* prepare test area */ + if (mmc_can_erase(test->card) && + tdata->prepare & MMC_TEST_PREP_ERASE) { + ret = mmc_erase(test->card, dev_addr, + size / 512, MMC_SECURE_ERASE_ARG); + if (ret) + ret = mmc_erase(test->card, dev_addr, + size / 512, MMC_ERASE_ARG); + if (ret) + goto err; + } + + /* Run test */ + ret = mmc_test_area_io_seq(test, reqsize, dev_addr, + tdata->do_write, 0, 1, size / reqsize, + tdata->do_nonblock_req, min_sg_len); + if (ret) + goto err; + + return ret; + err: + printk(KERN_INFO "[%s] error\n", __func__); + return ret; +} + +static int mmc_test_rw_multiple_size(struct mmc_test_card *test, + struct mmc_test_multiple_rw *rw) +{ + int ret = 0; + int i; + void *pre_req = test->card->host->ops->pre_req; + void *post_req = test->card->host->ops->post_req; + + if (rw->do_nonblock_req && + ((!pre_req && post_req) || (pre_req && !post_req))) { + printk(KERN_INFO "error: only one of pre/post is defined\n"); + return -EINVAL; + } + + for (i = 0 ; i < rw->len && ret == 0; i++) { + ret = mmc_test_rw_multiple(test, rw, rw->bs[i], rw->size, 0); + if (ret) + break; + } + return ret; +} + +static int mmc_test_rw_multiple_sg_len(struct mmc_test_card *test, + struct mmc_test_multiple_rw *rw) +{ + int ret = 0; + int i; + + for (i = 0 ; i < rw->len && ret == 0; i++) { + ret = mmc_test_rw_multiple(test, rw, 512*1024, rw->size, + rw->sg_len[i]); + if (ret) + break; + } + return ret; +} + +/* + * Multiple blocking write 4k to 4 MB chunks + */ +static int mmc_test_profile_mult_write_blocking_perf(struct mmc_test_card *test) +{ + unsigned int bs[] = {1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, + 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 22}; + struct mmc_test_multiple_rw test_data = { + .bs = bs, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(bs), + .do_write = true, + .do_nonblock_req = false, + .prepare = MMC_TEST_PREP_ERASE, + }; + + return mmc_test_rw_multiple_size(test, &test_data); +}; + +/* + * Multiple non-blocking write 4k to 4 MB chunks + */ +static int mmc_test_profile_mult_write_nonblock_perf(struct mmc_test_card *test) +{ + unsigned int bs[] = {1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, + 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 22}; + struct mmc_test_multiple_rw test_data = { + .bs = bs, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(bs), + .do_write = true, + .do_nonblock_req = true, + .prepare = MMC_TEST_PREP_ERASE, + }; + + return mmc_test_rw_multiple_size(test, &test_data); +} + +/* + * Multiple blocking read 4k to 4 MB chunks + */ +static int mmc_test_profile_mult_read_blocking_perf(struct mmc_test_card *test) +{ + unsigned int bs[] = {1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, + 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 22}; + struct 
mmc_test_multiple_rw test_data = { + .bs = bs, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(bs), + .do_write = false, + .do_nonblock_req = false, + .prepare = MMC_TEST_PREP_NONE, + }; + + return mmc_test_rw_multiple_size(test, &test_data); +} + +/* + * Multiple non-blocking read 4k to 4 MB chunks + */ +static int mmc_test_profile_mult_read_nonblock_perf(struct mmc_test_card *test) +{ + unsigned int bs[] = {1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, + 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 22}; + struct mmc_test_multiple_rw test_data = { + .bs = bs, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(bs), + .do_write = false, + .do_nonblock_req = true, + .prepare = MMC_TEST_PREP_NONE, + }; + + return mmc_test_rw_multiple_size(test, &test_data); +} + +/* + * Multiple blocking write 1 to 512 sg elements + */ +static int mmc_test_profile_sglen_wr_blocking_perf(struct mmc_test_card *test) +{ + unsigned int sg_len[] = {1, 1 << 3, 1 << 4, 1 << 5, 1 << 6, + 1 << 7, 1 << 8, 1 << 9}; + struct mmc_test_multiple_rw test_data = { + .sg_len = sg_len, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(sg_len), + .do_write = true, + .do_nonblock_req = false, + .prepare = MMC_TEST_PREP_ERASE, + }; + + return mmc_test_rw_multiple_sg_len(test, &test_data); +}; + +/* + * Multiple non-blocking write 1 to 512 sg elements + */ +static int mmc_test_profile_sglen_wr_nonblock_perf(struct mmc_test_card *test) +{ + unsigned int sg_len[] = {1, 1 << 3, 1 << 4, 1 << 5, 1 << 6, + 1 << 7, 1 << 8, 1 << 9}; + struct mmc_test_multiple_rw test_data = { + .sg_len = sg_len, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(sg_len), + .do_write = true, + .do_nonblock_req = true, + .prepare = MMC_TEST_PREP_ERASE, + }; + + return mmc_test_rw_multiple_sg_len(test, &test_data); +} + +/* + * Multiple blocking read 1 to 512 sg elements + */ +static int mmc_test_profile_sglen_r_blocking_perf(struct mmc_test_card *test) +{ + unsigned int sg_len[] = {1, 1 << 3, 1 << 4, 1 << 5, 1 << 6, + 1 << 7, 1 << 8, 1 << 9}; + struct mmc_test_multiple_rw test_data = { + .sg_len = sg_len, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(sg_len), + .do_write = false, + .do_nonblock_req = false, + .prepare = MMC_TEST_PREP_NONE, + }; + + return mmc_test_rw_multiple_sg_len(test, &test_data); +} + +/* + * Multiple non-blocking read 1 to 512 sg elements + */ +static int mmc_test_profile_sglen_r_nonblock_perf(struct mmc_test_card *test) +{ + unsigned int sg_len[] = {1, 1 << 3, 1 << 4, 1 << 5, 1 << 6, + 1 << 7, 1 << 8, 1 << 9}; + struct mmc_test_multiple_rw test_data = { + .sg_len = sg_len, + .size = TEST_AREA_MAX_SIZE, + .len = ARRAY_SIZE(sg_len), + .do_write = false, + .do_nonblock_req = true, + .prepare = MMC_TEST_PREP_NONE, + }; + + return mmc_test_rw_multiple_sg_len(test, &test_data); +} + static const struct mmc_test_case mmc_test_cases[] = { { .name = "Basic write (no data verification)", @@ -2221,6 +2595,61 @@ static const struct mmc_test_case mmc_test_cases[] = { .cleanup = mmc_test_area_cleanup, }, + { + .name = "Write performance with blocking req 4k to 4MB", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_mult_write_blocking_perf, + .cleanup = mmc_test_area_cleanup, + }, + + { + .name = "Write performance with non-blocking req 4k to 4MB", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_mult_write_nonblock_perf, + .cleanup = mmc_test_area_cleanup, + }, + + { + .name = "Read performance with blocking req 4k to 4MB", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_mult_read_blocking_perf, + .cleanup = 
mmc_test_area_cleanup, + }, + + { + .name = "Read performance with non-blocking req 4k to 4MB", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_mult_read_nonblock_perf, + .cleanup = mmc_test_area_cleanup, + }, + + { + .name = "Write performance blocking req 1 to 512 sg elems", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_sglen_wr_blocking_perf, + .cleanup = mmc_test_area_cleanup, + }, + + { + .name = "Write performance non-blocking req 1 to 512 sg elems", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_sglen_wr_nonblock_perf, + .cleanup = mmc_test_area_cleanup, + }, + + { + .name = "Read performance blocking req 1 to 512 sg elems", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_sglen_r_blocking_perf, + .cleanup = mmc_test_area_cleanup, + }, + + { + .name = "Read performance non-blocking req 1 to 512 sg elems", + .prepare = mmc_test_area_prepare, + .run = mmc_test_profile_sglen_r_nonblock_perf, + .cleanup = mmc_test_area_cleanup, + }, }; static DEFINE_MUTEX(mmc_test_lock); @@ -2445,6 +2874,32 @@ static const struct file_operations mmc_test_fops_test = { .release = single_release, }; +static int mtf_testlist_show(struct seq_file *sf, void *data) +{ + int i; + + mutex_lock(&mmc_test_lock); + + for (i = 0; i < ARRAY_SIZE(mmc_test_cases); i++) + seq_printf(sf, "%d:\t%s\n", i+1, mmc_test_cases[i].name); + + mutex_unlock(&mmc_test_lock); + + return 0; +} + +static int mtf_testlist_open(struct inode *inode, struct file *file) +{ + return single_open(file, mtf_testlist_show, inode->i_private); +} + +static const struct file_operations mmc_test_fops_testlist = { + .open = mtf_testlist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static void mmc_test_free_file_test(struct mmc_card *card) { struct mmc_test_dbgfs_file *df, *dfs; @@ -2476,7 +2931,18 @@ static int mmc_test_register_file_test(struct mmc_card *card) if (IS_ERR_OR_NULL(file)) { dev_err(&card->dev, - "Can't create file. Perhaps debugfs is disabled.\n"); + "Can't create test. Perhaps debugfs is disabled.\n"); + ret = -ENODEV; + goto err; + } + + if (card->debugfs_root) + file = debugfs_create_file("testlist", S_IRUGO, + card->debugfs_root, card, &mmc_test_fops_testlist); + + if (IS_ERR_OR_NULL(file)) { + dev_err(&card->dev, + "Can't create testlist. Perhaps debugfs is disabled.\n"); ret = -ENODEV; goto err; } diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 6413afa318d2..45fb362e3f01 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -52,14 +52,18 @@ static int mmc_queue_thread(void *d) down(&mq->thread_sem); do { struct request *req = NULL; + struct mmc_queue_req *tmp; spin_lock_irq(q->queue_lock); set_current_state(TASK_INTERRUPTIBLE); req = blk_fetch_request(q); - mq->req = req; + mq->mqrq_cur->req = req; spin_unlock_irq(q->queue_lock); - if (!req) { + if (req || mq->mqrq_prev->req) { + set_current_state(TASK_RUNNING); + mq->issue_fn(mq, req); + } else { if (kthread_should_stop()) { set_current_state(TASK_RUNNING); break; @@ -67,11 +71,14 @@ static int mmc_queue_thread(void *d) up(&mq->thread_sem); schedule(); down(&mq->thread_sem); - continue; } - set_current_state(TASK_RUNNING); - mq->issue_fn(mq, req); + /* Current request becomes previous request and vice versa. 
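+ * The request just fetched is issued from mqrq_cur while the one in
+ * mqrq_prev may still be in flight in the host driver: swapping the
+ * two slots after each issue is what lets the next request be
+ * prepared in parallel with the current transfer.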
*/ + mq->mqrq_prev->brq.mrq.data = NULL; + mq->mqrq_prev->req = NULL; + tmp = mq->mqrq_prev; + mq->mqrq_prev = mq->mqrq_cur; + mq->mqrq_cur = tmp; } while (1); up(&mq->thread_sem); @@ -97,10 +104,46 @@ static void mmc_request(struct request_queue *q) return; } - if (!mq->req) + if (!mq->mqrq_cur->req && !mq->mqrq_prev->req) wake_up_process(mq->thread); } +struct scatterlist *mmc_alloc_sg(int sg_len, int *err) +{ + struct scatterlist *sg; + + sg = kmalloc(sizeof(struct scatterlist)*sg_len, GFP_KERNEL); + if (!sg) + *err = -ENOMEM; + else { + *err = 0; + sg_init_table(sg, sg_len); + } + + return sg; +} + +static void mmc_queue_setup_discard(struct request_queue *q, + struct mmc_card *card) +{ + unsigned max_discard; + + max_discard = mmc_calc_max_discard(card); + if (!max_discard) + return; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + q->limits.max_discard_sectors = max_discard; + if (card->erased_byte == 0) + q->limits.discard_zeroes_data = 1; + q->limits.discard_granularity = card->pref_erase << 9; + /* granularity must not be greater than max. discard */ + if (card->pref_erase > max_discard) + q->limits.discard_granularity = 0; + if (mmc_can_secure_erase_trim(card)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); +} + /** * mmc_init_queue - initialise a queue structure. * @mq: mmc queue @@ -116,6 +159,8 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, struct mmc_host *host = card->host; u64 limit = BLK_BOUNCE_HIGH; int ret; + struct mmc_queue_req *mqrq_cur = &mq->mqrq[0]; + struct mmc_queue_req *mqrq_prev = &mq->mqrq[1]; if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) limit = *mmc_dev(host)->dma_mask; @@ -125,21 +170,16 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (!mq->queue) return -ENOMEM; + memset(&mq->mqrq_cur, 0, sizeof(mq->mqrq_cur)); + memset(&mq->mqrq_prev, 0, sizeof(mq->mqrq_prev)); + mq->mqrq_cur = mqrq_cur; + mq->mqrq_prev = mqrq_prev; mq->queue->queuedata = mq; - mq->req = NULL; blk_queue_prep_rq(mq->queue, mmc_prep_request); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue); - if (mmc_can_erase(card)) { - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue); - mq->queue->limits.max_discard_sectors = UINT_MAX; - if (card->erased_byte == 0) - mq->queue->limits.discard_zeroes_data = 1; - mq->queue->limits.discard_granularity = card->pref_erase << 9; - if (mmc_can_secure_erase_trim(card)) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, - mq->queue); - } + if (mmc_can_erase(card)) + mmc_queue_setup_discard(mq->queue, card); #ifdef CONFIG_MMC_BLOCK_BOUNCE if (host->max_segs == 1) { @@ -155,53 +195,64 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, bouncesz = host->max_blk_count * 512; if (bouncesz > 512) { - mq->bounce_buf = kmalloc(bouncesz, GFP_KERNEL); - if (!mq->bounce_buf) { + mqrq_cur->bounce_buf = kmalloc(bouncesz, GFP_KERNEL); + if (!mqrq_cur->bounce_buf) { + printk(KERN_WARNING "%s: unable to " + "allocate bounce cur buffer\n", + mmc_card_name(card)); + } + mqrq_prev->bounce_buf = kmalloc(bouncesz, GFP_KERNEL); + if (!mqrq_prev->bounce_buf) { printk(KERN_WARNING "%s: unable to " - "allocate bounce buffer\n", + "allocate bounce prev buffer\n", mmc_card_name(card)); + kfree(mqrq_cur->bounce_buf); + mqrq_cur->bounce_buf = NULL; } } - if (mq->bounce_buf) { + if (mqrq_cur->bounce_buf && mqrq_prev->bounce_buf) { blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY); blk_queue_max_hw_sectors(mq->queue, bouncesz / 512); blk_queue_max_segments(mq->queue, bouncesz / 512); 
blk_queue_max_segment_size(mq->queue, bouncesz); - mq->sg = kmalloc(sizeof(struct scatterlist), - GFP_KERNEL); - if (!mq->sg) { - ret = -ENOMEM; + mqrq_cur->sg = mmc_alloc_sg(1, &ret); + if (ret) goto cleanup_queue; - } - sg_init_table(mq->sg, 1); - mq->bounce_sg = kmalloc(sizeof(struct scatterlist) * - bouncesz / 512, GFP_KERNEL); - if (!mq->bounce_sg) { - ret = -ENOMEM; + mqrq_cur->bounce_sg = + mmc_alloc_sg(bouncesz / 512, &ret); + if (ret) + goto cleanup_queue; + + mqrq_prev->sg = mmc_alloc_sg(1, &ret); + if (ret) + goto cleanup_queue; + + mqrq_prev->bounce_sg = + mmc_alloc_sg(bouncesz / 512, &ret); + if (ret) goto cleanup_queue; - } - sg_init_table(mq->bounce_sg, bouncesz / 512); } } #endif - if (!mq->bounce_buf) { + if (!mqrq_cur->bounce_buf && !mqrq_prev->bounce_buf) { blk_queue_bounce_limit(mq->queue, limit); blk_queue_max_hw_sectors(mq->queue, min(host->max_blk_count, host->max_req_size / 512)); blk_queue_max_segments(mq->queue, host->max_segs); blk_queue_max_segment_size(mq->queue, host->max_seg_size); - mq->sg = kmalloc(sizeof(struct scatterlist) * - host->max_segs, GFP_KERNEL); - if (!mq->sg) { - ret = -ENOMEM; + mqrq_cur->sg = mmc_alloc_sg(host->max_segs, &ret); + if (ret) + goto cleanup_queue; + + + mqrq_prev->sg = mmc_alloc_sg(host->max_segs, &ret); + if (ret) goto cleanup_queue; - } - sg_init_table(mq->sg, host->max_segs); } sema_init(&mq->thread_sem, 1); @@ -216,16 +267,22 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, return 0; free_bounce_sg: - if (mq->bounce_sg) - kfree(mq->bounce_sg); - mq->bounce_sg = NULL; + kfree(mqrq_cur->bounce_sg); + mqrq_cur->bounce_sg = NULL; + kfree(mqrq_prev->bounce_sg); + mqrq_prev->bounce_sg = NULL; + cleanup_queue: - if (mq->sg) - kfree(mq->sg); - mq->sg = NULL; - if (mq->bounce_buf) - kfree(mq->bounce_buf); - mq->bounce_buf = NULL; + kfree(mqrq_cur->sg); + mqrq_cur->sg = NULL; + kfree(mqrq_cur->bounce_buf); + mqrq_cur->bounce_buf = NULL; + + kfree(mqrq_prev->sg); + mqrq_prev->sg = NULL; + kfree(mqrq_prev->bounce_buf); + mqrq_prev->bounce_buf = NULL; + blk_cleanup_queue(mq->queue); return ret; } @@ -234,6 +291,8 @@ void mmc_cleanup_queue(struct mmc_queue *mq) { struct request_queue *q = mq->queue; unsigned long flags; + struct mmc_queue_req *mqrq_cur = mq->mqrq_cur; + struct mmc_queue_req *mqrq_prev = mq->mqrq_prev; /* Make sure the queue isn't suspended, as that will deadlock */ mmc_queue_resume(mq); @@ -247,16 +306,23 @@ void mmc_cleanup_queue(struct mmc_queue *mq) blk_start_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); - if (mq->bounce_sg) - kfree(mq->bounce_sg); - mq->bounce_sg = NULL; + kfree(mqrq_cur->bounce_sg); + mqrq_cur->bounce_sg = NULL; - kfree(mq->sg); - mq->sg = NULL; + kfree(mqrq_cur->sg); + mqrq_cur->sg = NULL; - if (mq->bounce_buf) - kfree(mq->bounce_buf); - mq->bounce_buf = NULL; + kfree(mqrq_cur->bounce_buf); + mqrq_cur->bounce_buf = NULL; + + kfree(mqrq_prev->bounce_sg); + mqrq_prev->bounce_sg = NULL; + + kfree(mqrq_prev->sg); + mqrq_prev->sg = NULL; + + kfree(mqrq_prev->bounce_buf); + mqrq_prev->bounce_buf = NULL; mq->card = NULL; } @@ -309,27 +375,27 @@ void mmc_queue_resume(struct mmc_queue *mq) /* * Prepare the sg list(s) to be handed of to the host driver */ -unsigned int mmc_queue_map_sg(struct mmc_queue *mq) +unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq) { unsigned int sg_len; size_t buflen; struct scatterlist *sg; int i; - if (!mq->bounce_buf) - return blk_rq_map_sg(mq->queue, mq->req, mq->sg); + if (!mqrq->bounce_buf) + return 
blk_rq_map_sg(mq->queue, mqrq->req, mqrq->sg); - BUG_ON(!mq->bounce_sg); + BUG_ON(!mqrq->bounce_sg); - sg_len = blk_rq_map_sg(mq->queue, mq->req, mq->bounce_sg); + sg_len = blk_rq_map_sg(mq->queue, mqrq->req, mqrq->bounce_sg); - mq->bounce_sg_len = sg_len; + mqrq->bounce_sg_len = sg_len; buflen = 0; - for_each_sg(mq->bounce_sg, sg, sg_len, i) + for_each_sg(mqrq->bounce_sg, sg, sg_len, i) buflen += sg->length; - sg_init_one(mq->sg, mq->bounce_buf, buflen); + sg_init_one(mqrq->sg, mqrq->bounce_buf, buflen); return 1; } @@ -338,31 +404,30 @@ unsigned int mmc_queue_map_sg(struct mmc_queue *mq) * If writing, bounce the data to the buffer before the request * is sent to the host driver */ -void mmc_queue_bounce_pre(struct mmc_queue *mq) +void mmc_queue_bounce_pre(struct mmc_queue_req *mqrq) { - if (!mq->bounce_buf) + if (!mqrq->bounce_buf) return; - if (rq_data_dir(mq->req) != WRITE) + if (rq_data_dir(mqrq->req) != WRITE) return; - sg_copy_to_buffer(mq->bounce_sg, mq->bounce_sg_len, - mq->bounce_buf, mq->sg[0].length); + sg_copy_to_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len, + mqrq->bounce_buf, mqrq->sg[0].length); } /* * If reading, bounce the data from the buffer after the request * has been handled by the host driver */ -void mmc_queue_bounce_post(struct mmc_queue *mq) +void mmc_queue_bounce_post(struct mmc_queue_req *mqrq) { - if (!mq->bounce_buf) + if (!mqrq->bounce_buf) return; - if (rq_data_dir(mq->req) != READ) + if (rq_data_dir(mqrq->req) != READ) return; - sg_copy_from_buffer(mq->bounce_sg, mq->bounce_sg_len, - mq->bounce_buf, mq->sg[0].length); + sg_copy_from_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len, + mqrq->bounce_buf, mqrq->sg[0].length); } - diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index 6223ef8dc9cd..d2a1eb4b9f9f 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -4,19 +4,35 @@ struct request; struct task_struct; +struct mmc_blk_request { + struct mmc_request mrq; + struct mmc_command sbc; + struct mmc_command cmd; + struct mmc_command stop; + struct mmc_data data; +}; + +struct mmc_queue_req { + struct request *req; + struct mmc_blk_request brq; + struct scatterlist *sg; + char *bounce_buf; + struct scatterlist *bounce_sg; + unsigned int bounce_sg_len; + struct mmc_async_req mmc_active; +}; + struct mmc_queue { struct mmc_card *card; struct task_struct *thread; struct semaphore thread_sem; unsigned int flags; - struct request *req; int (*issue_fn)(struct mmc_queue *, struct request *); void *data; struct request_queue *queue; - struct scatterlist *sg; - char *bounce_buf; - struct scatterlist *bounce_sg; - unsigned int bounce_sg_len; + struct mmc_queue_req mqrq[2]; + struct mmc_queue_req *mqrq_cur; + struct mmc_queue_req *mqrq_prev; }; extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, @@ -25,8 +41,9 @@ extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); -extern unsigned int mmc_queue_map_sg(struct mmc_queue *); -extern void mmc_queue_bounce_pre(struct mmc_queue *); -extern void mmc_queue_bounce_post(struct mmc_queue *); +extern unsigned int mmc_queue_map_sg(struct mmc_queue *, + struct mmc_queue_req *); +extern void mmc_queue_bounce_pre(struct mmc_queue_req *); +extern void mmc_queue_bounce_post(struct mmc_queue_req *); #endif diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 7843efe22359..f091b43d00c4 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -198,9 +198,109 
@@ mmc_start_request(struct mmc_host *host, struct mmc_request *mrq)
 
 static void mmc_wait_done(struct mmc_request *mrq)
 {
-	complete(mrq->done_data);
+	complete(&mrq->completion);
 }
 
+static void __mmc_start_req(struct mmc_host *host, struct mmc_request *mrq)
+{
+	init_completion(&mrq->completion);
+	mrq->done = mmc_wait_done;
+	mmc_start_request(host, mrq);
+}
+
+static void mmc_wait_for_req_done(struct mmc_host *host,
+				  struct mmc_request *mrq)
+{
+	wait_for_completion(&mrq->completion);
+}
+
+/**
+ *	mmc_pre_req - Prepare for a new request
+ *	@host: MMC host to prepare command
+ *	@mrq: MMC request to prepare for
+ *	@is_first_req: true if there is no previously started request
+ *		that may run in parallel to this call, otherwise false
+ *
+ *	mmc_pre_req() is called prior to mmc_start_req() to let the
+ *	host prepare for the new request. Preparation of a request may be
+ *	performed while another request is running on the host.
+ */
+static void mmc_pre_req(struct mmc_host *host, struct mmc_request *mrq,
+			bool is_first_req)
+{
+	if (host->ops->pre_req)
+		host->ops->pre_req(host, mrq, is_first_req);
+}
+
+/**
+ *	mmc_post_req - Post process a completed request
+ *	@host: MMC host to post process command
+ *	@mrq: MMC request to post process for
+ *	@err: Error, if non-zero, clean up any resources made in pre_req
+ *
+ *	Let the host post process a completed request. Post processing of
+ *	a request may be performed while another request is running.
+ */
+static void mmc_post_req(struct mmc_host *host, struct mmc_request *mrq,
+			 int err)
+{
+	if (host->ops->post_req)
+		host->ops->post_req(host, mrq, err);
+}
+
+/**
+ *	mmc_start_req - start a non-blocking request
+ *	@host: MMC host to start command
+ *	@areq: async request to start
+ *	@error: out parameter; returns 0 for success, otherwise non-zero
+ *
+ *	Start a new MMC custom command request for a host.
+ *	If there is an ongoing async request, wait for completion
+ *	of that request, then start the new one and return.
+ *	Does not wait for the new request to complete.
+ *
+ *	Returns the completed request, or NULL if none completed.
+ *	Waits for an ongoing request (previously started) to complete and
+ *	returns the completed request. If there is no ongoing request, NULL
+ *	is returned without waiting. NULL is not an error condition.
+ */
+struct mmc_async_req *mmc_start_req(struct mmc_host *host,
+				    struct mmc_async_req *areq, int *error)
+{
+	int err = 0;
+	struct mmc_async_req *data = host->areq;
+
+	/* Prepare a new request */
+	if (areq)
+		mmc_pre_req(host, areq->mrq, !host->areq);
+
+	if (host->areq) {
+		mmc_wait_for_req_done(host, host->areq->mrq);
+		err = host->areq->err_check(host->card, host->areq);
+		if (err) {
+			mmc_post_req(host, host->areq->mrq, 0);
+			if (areq)
+				mmc_post_req(host, areq->mrq, -EINVAL);
+
+			host->areq = NULL;
+			goto out;
+		}
+	}
+
+	if (areq)
+		__mmc_start_req(host, areq->mrq);
+
+	if (host->areq)
+		mmc_post_req(host, host->areq->mrq, 0);
+
+	host->areq = areq;
+ out:
+	if (error)
+		*error = err;
+	return data;
+}
+EXPORT_SYMBOL(mmc_start_req);
+
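[Not part of the diff: a minimal sketch of the call pattern the mmc_start_req() kernel-doc above implies. Two request slots are kept so that preparing the next request overlaps the one in flight; more_work(), prepare_areq() and finish_areq() are hypothetical helpers, not functions from this patch.]

	struct mmc_async_req areqs[2], *done;
	int idx = 0, err = 0;

	while (more_work()) {			/* hypothetical */
		prepare_areq(&areqs[idx]);	/* fill in ->mrq and ->err_check */

		/*
		 * Starts areqs[idx] and returns the previously started
		 * request (NULL on the first pass), already post-processed.
		 */
		done = mmc_start_req(host, &areqs[idx], &err);
		if (err)
			break;			/* the *previous* request failed */
		if (done)
			finish_areq(done);	/* hypothetical completion hook */
		idx = !idx;			/* alternate between the two slots */
	}

	/* Passing a NULL areq drains the last outstanding request, if any. */
	done = mmc_start_req(host, NULL, &err);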
 /**
  *	mmc_wait_for_req - start a request and wait for completion
  *	@host: MMC host to start command
@@ -212,16 +312,9 @@ static void mmc_wait_done(struct mmc_request *mrq)
  */
 void mmc_wait_for_req(struct mmc_host *host, struct mmc_request *mrq)
 {
-	DECLARE_COMPLETION_ONSTACK(complete);
-
-	mrq->done_data = &complete;
-	mrq->done = mmc_wait_done;
-
-	mmc_start_request(host, mrq);
-
-	wait_for_completion(&complete);
+	__mmc_start_req(host, mrq);
+	mmc_wait_for_req_done(host, mrq);
 }
-
 EXPORT_SYMBOL(mmc_wait_for_req);
 
 /**
@@ -1516,6 +1609,82 @@ int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from,
 }
 EXPORT_SYMBOL(mmc_erase_group_aligned);
 
+static unsigned int mmc_do_calc_max_discard(struct mmc_card *card,
+					    unsigned int arg)
+{
+	struct mmc_host *host = card->host;
+	unsigned int max_discard, x, y, qty = 0, max_qty, timeout;
+	unsigned int last_timeout = 0;
+
+	if (card->erase_shift)
+		max_qty = UINT_MAX >> card->erase_shift;
+	else if (mmc_card_sd(card))
+		max_qty = UINT_MAX;
+	else
+		max_qty = UINT_MAX / card->erase_size;
+
+	/* Find the largest qty with an OK timeout */
+	do {
+		y = 0;
+		for (x = 1; x && x <= max_qty && max_qty - x >= qty; x <<= 1) {
+			timeout = mmc_erase_timeout(card, arg, qty + x);
+			if (timeout > host->max_discard_to)
+				break;
+			if (timeout < last_timeout)
+				break;
+			last_timeout = timeout;
+			y = x;
+		}
+		qty += y;
+	} while (y);
+
+	if (!qty)
+		return 0;
+
+	if (qty == 1)
+		return 1;
+
+	/* Convert qty to sectors */
+	if (card->erase_shift)
+		max_discard = --qty << card->erase_shift;
+	else if (mmc_card_sd(card))
+		max_discard = qty;
+	else
+		max_discard = --qty * card->erase_size;
+
+	return max_discard;
+}
+
+unsigned int mmc_calc_max_discard(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	unsigned int max_discard, max_trim;
+
+	if (!host->max_discard_to)
+		return UINT_MAX;
+
+	/*
+	 * Without erase_group_def set, MMC erase timeout depends on clock
+	 * frequency, which can change. In that case, the best choice is
+	 * just the preferred erase size.
+	 */
+	if (mmc_card_mmc(card) && !(card->ext_csd.erase_group_def & 1))
+		return card->pref_erase;
+
+	max_discard = mmc_do_calc_max_discard(card, MMC_ERASE_ARG);
+	if (mmc_can_trim(card)) {
+		max_trim = mmc_do_calc_max_discard(card, MMC_TRIM_ARG);
+		if (max_trim < max_discard)
+			max_discard = max_trim;
+	} else if (max_discard < card->erase_size) {
+		max_discard = 0;
+	}
+	pr_debug("%s: calculated max.
discard sectors %u for timeout %u ms\n", + mmc_hostname(host), max_discard, host->max_discard_to); + return max_discard; +} +EXPORT_SYMBOL(mmc_calc_max_discard); + int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen) { struct mmc_command cmd = {0}; @@ -1663,6 +1832,10 @@ int mmc_power_save_host(struct mmc_host *host) { int ret = 0; +#ifdef CONFIG_MMC_DEBUG + pr_info("%s: %s: powering down\n", mmc_hostname(host), __func__); +#endif + mmc_bus_get(host); if (!host->bus_ops || host->bus_dead || !host->bus_ops->power_restore) { @@ -1685,6 +1858,10 @@ int mmc_power_restore_host(struct mmc_host *host) { int ret; +#ifdef CONFIG_MMC_DEBUG + pr_info("%s: %s: powering up\n", mmc_hostname(host), __func__); +#endif + mmc_bus_get(host); if (!host->bus_ops || host->bus_dead || !host->bus_ops->power_restore) { diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index ff2774128aa9..633975ff2bb3 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -409,52 +409,62 @@ out: static int sd_select_driver_type(struct mmc_card *card, u8 *status) { - int host_drv_type = 0, card_drv_type = 0; + int host_drv_type = SD_DRIVER_TYPE_B; + int card_drv_type = SD_DRIVER_TYPE_B; + int drive_strength; int err; /* * If the host doesn't support any of the Driver Types A,C or D, - * default Driver Type B is used. + * or there is no board specific handler then default Driver + * Type B is used. */ if (!(card->host->caps & (MMC_CAP_DRIVER_TYPE_A | MMC_CAP_DRIVER_TYPE_C | MMC_CAP_DRIVER_TYPE_D))) return 0; - if (card->host->caps & MMC_CAP_DRIVER_TYPE_A) { - host_drv_type = MMC_SET_DRIVER_TYPE_A; - if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_A) - card_drv_type = MMC_SET_DRIVER_TYPE_A; - else if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_B) - card_drv_type = MMC_SET_DRIVER_TYPE_B; - else if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C) - card_drv_type = MMC_SET_DRIVER_TYPE_C; - } else if (card->host->caps & MMC_CAP_DRIVER_TYPE_C) { - host_drv_type = MMC_SET_DRIVER_TYPE_C; - if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C) - card_drv_type = MMC_SET_DRIVER_TYPE_C; - } else if (!(card->host->caps & MMC_CAP_DRIVER_TYPE_D)) { - /* - * If we are here, that means only the default driver type - * B is supported by the host. - */ - host_drv_type = MMC_SET_DRIVER_TYPE_B; - if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_B) - card_drv_type = MMC_SET_DRIVER_TYPE_B; - else if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C) - card_drv_type = MMC_SET_DRIVER_TYPE_C; - } + if (!card->host->ops->select_drive_strength) + return 0; + + if (card->host->caps & MMC_CAP_DRIVER_TYPE_A) + host_drv_type |= SD_DRIVER_TYPE_A; + + if (card->host->caps & MMC_CAP_DRIVER_TYPE_C) + host_drv_type |= SD_DRIVER_TYPE_C; + + if (card->host->caps & MMC_CAP_DRIVER_TYPE_D) + host_drv_type |= SD_DRIVER_TYPE_D; + + if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_A) + card_drv_type |= SD_DRIVER_TYPE_A; + + if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_C) + card_drv_type |= SD_DRIVER_TYPE_C; + + if (card->sw_caps.sd3_drv_type & SD_DRIVER_TYPE_D) + card_drv_type |= SD_DRIVER_TYPE_D; + + /* + * The drive strength that the hardware can support + * depends on the board design. 
Pass the appropriate + * information and let the hardware specific code + * return what is possible given the options + */ + drive_strength = card->host->ops->select_drive_strength( + card->sw_caps.uhs_max_dtr, + host_drv_type, card_drv_type); - err = mmc_sd_switch(card, 1, 2, card_drv_type, status); + err = mmc_sd_switch(card, 1, 2, drive_strength, status); if (err) return err; - if ((status[15] & 0xF) != card_drv_type) { - printk(KERN_WARNING "%s: Problem setting driver strength!\n", + if ((status[15] & 0xF) != drive_strength) { + printk(KERN_WARNING "%s: Problem setting drive strength!\n", mmc_hostname(card->host)); return 0; } - mmc_set_driver_type(card->host, host_drv_type); + mmc_set_driver_type(card->host, drive_strength); return 0; } diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index d2565df8a7fb..e4e6822d09e3 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -167,11 +167,8 @@ static int sdio_bus_remove(struct device *dev) int ret = 0; /* Make sure card is powered before invoking ->remove() */ - if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD) { - ret = pm_runtime_get_sync(dev); - if (ret < 0) - goto out; - } + if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD) + pm_runtime_get_sync(dev); drv->remove(func); @@ -191,7 +188,6 @@ static int sdio_bus_remove(struct device *dev) if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD) pm_runtime_put_sync(dev); -out: return ret; } diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 56dbf3f6ad08..8c87096531e9 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -81,28 +81,32 @@ config MMC_RICOH_MMC If unsure, say Y. -config MMC_SDHCI_OF - tristate "SDHCI support on OpenFirmware platforms" - depends on MMC_SDHCI && OF +config MMC_SDHCI_PLTFM + tristate "SDHCI platform and OF driver helper" + depends on MMC_SDHCI help - This selects the OF support for Secure Digital Host Controller - Interfaces. + This selects the common helper functions support for Secure Digital + Host Controller Interface based platform and OF drivers. + + If you have a controller with this interface, say Y or M here. If unsure, say N. config MMC_SDHCI_OF_ESDHC - bool "SDHCI OF support for the Freescale eSDHC controller" - depends on MMC_SDHCI_OF + tristate "SDHCI OF support for the Freescale eSDHC controller" + depends on MMC_SDHCI_PLTFM depends on PPC_OF select MMC_SDHCI_BIG_ENDIAN_32BIT_BYTE_SWAPPER help This selects the Freescale eSDHC controller support. + If you have a controller with this interface, say Y or M here. + If unsure, say N. config MMC_SDHCI_OF_HLWD - bool "SDHCI OF support for the Nintendo Wii SDHCI controllers" - depends on MMC_SDHCI_OF + tristate "SDHCI OF support for the Nintendo Wii SDHCI controllers" + depends on MMC_SDHCI_PLTFM depends on PPC_OF select MMC_SDHCI_BIG_ENDIAN_32BIT_BYTE_SWAPPER help @@ -110,40 +114,36 @@ config MMC_SDHCI_OF_HLWD found in the "Hollywood" chipset of the Nintendo Wii video game console. - If unsure, say N. - -config MMC_SDHCI_PLTFM - tristate "SDHCI support on the platform specific bus" - depends on MMC_SDHCI - help - This selects the platform specific bus support for Secure Digital Host - Controller Interface. - If you have a controller with this interface, say Y or M here. If unsure, say N. 
config MMC_SDHCI_CNS3XXX - bool "SDHCI support on the Cavium Networks CNS3xxx SoC" + tristate "SDHCI support on the Cavium Networks CNS3xxx SoC" depends on ARCH_CNS3XXX depends on MMC_SDHCI_PLTFM help This selects the SDHCI support for CNS3xxx System-on-Chip devices. + If you have a controller with this interface, say Y or M here. + If unsure, say N. config MMC_SDHCI_ESDHC_IMX - bool "SDHCI platform support for the Freescale eSDHC i.MX controller" - depends on MMC_SDHCI_PLTFM && (ARCH_MX25 || ARCH_MX35 || ARCH_MX5) + tristate "SDHCI platform support for the Freescale eSDHC i.MX controller" + depends on ARCH_MX25 || ARCH_MX35 || ARCH_MX5 + depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help This selects the Freescale eSDHC controller support on the platform bus, found on platforms like mx35/51. + If you have a controller with this interface, say Y or M here. + If unsure, say N. config MMC_SDHCI_DOVE - bool "SDHCI support on Marvell's Dove SoC" + tristate "SDHCI support on Marvell's Dove SoC" depends on ARCH_DOVE depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS @@ -151,11 +151,14 @@ config MMC_SDHCI_DOVE This selects the Secure Digital Host Controller Interface in Marvell's Dove SoC. + If you have a controller with this interface, say Y or M here. + If unsure, say N. config MMC_SDHCI_TEGRA - bool "SDHCI platform support for the Tegra SD/MMC Controller" - depends on MMC_SDHCI_PLTFM && ARCH_TEGRA + tristate "SDHCI platform support for the Tegra SD/MMC Controller" + depends on ARCH_TEGRA + depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help This selects the Tegra SD/MMC controller. If you have a Tegra @@ -178,14 +181,28 @@ config MMC_SDHCI_S3C If unsure, say N. -config MMC_SDHCI_PXA - tristate "Marvell PXA168/PXA910/MMP2 SD Host Controller support" - depends on ARCH_PXA || ARCH_MMP +config MMC_SDHCI_PXAV3 + tristate "Marvell MMP2 SD Host Controller support (PXAV3)" + depends on CLKDEV_LOOKUP select MMC_SDHCI - select MMC_SDHCI_IO_ACCESSORS + select MMC_SDHCI_PLTFM + default CPU_MMP2 + help + This selects the Marvell(R) PXAV3 SD Host Controller. + If you have a MMP2 platform with SD Host Controller + and a card slot, say Y or M here. + + If unsure, say N. + +config MMC_SDHCI_PXAV2 + tristate "Marvell PXA9XX SD Host Controller support (PXAV2)" + depends on CLKDEV_LOOKUP + select MMC_SDHCI + select MMC_SDHCI_PLTFM + default CPU_PXA910 help - This selects the Marvell(R) PXA168/PXA910/MMP2 SD Host Controller. - If you have a PXA168/PXA910/MMP2 platform with SD Host Controller + This selects the Marvell(R) PXAV2 SD Host Controller. + If you have a PXA9XX platform with SD Host Controller and a card slot, say Y or M here. If unsure, say N. @@ -281,13 +298,12 @@ config MMC_ATMELMCI endchoice config MMC_ATMELMCI_DMA - bool "Atmel MCI DMA support (EXPERIMENTAL)" - depends on MMC_ATMELMCI && (AVR32 || ARCH_AT91SAM9G45) && DMA_ENGINE && EXPERIMENTAL + bool "Atmel MCI DMA support" + depends on MMC_ATMELMCI && (AVR32 || ARCH_AT91SAM9G45) && DMA_ENGINE help Say Y here to have the Atmel MCI driver use a DMA engine to do data transfers and thus increase the throughput and - reduce the CPU utilization. Note that this is highly - experimental and may cause the driver to lock up. + reduce the CPU utilization. If unsure, say N. 
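[Not part of the diff: the pre_req()/post_req() host hooks wired up in core.c above are optional, and a host driver only gains the overlap if it implements them. Below is a minimal sketch of what such an implementation might look like for a hypothetical driver "foo"; it only moves dma_map_sg()/dma_unmap_sg() out of the request path, whereas a production driver (see the mmci hunks later in this patch) also caches the prepared DMA descriptor and validates the cookie.]

static void foo_pre_req(struct mmc_host *mmc, struct mmc_request *mrq,
			bool is_first_req)
{
	struct mmc_data *data = mrq->data;

	if (!data || data->host_cookie)
		return;

	/* Map the sg list while the previous transfer may still be running */
	if (dma_map_sg(mmc_dev(mmc), data->sg, data->sg_len,
		       (data->flags & MMC_DATA_READ) ?
		       DMA_FROM_DEVICE : DMA_TO_DEVICE))
		data->host_cookie = 1;	/* mark the request as prepared */
}

static void foo_post_req(struct mmc_host *mmc, struct mmc_request *mrq,
			 int err)
{
	struct mmc_data *data = mrq->data;

	if (!data || !data->host_cookie)
		return;

	dma_unmap_sg(mmc_dev(mmc), data->sg, data->sg_len,
		     (data->flags & MMC_DATA_READ) ?
		     DMA_FROM_DEVICE : DMA_TO_DEVICE);
	data->host_cookie = 0;
}

static const struct mmc_host_ops foo_ops = {
	.request	= foo_request,		/* hypothetical */
	.pre_req	= foo_pre_req,
	.post_req	= foo_post_req,
};
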
diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 58a5cf73d6e9..b4b83f302e32 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -9,7 +9,8 @@ obj-$(CONFIG_MMC_MXC) += mxcmmc.o obj-$(CONFIG_MMC_MXS) += mxs-mmc.o obj-$(CONFIG_MMC_SDHCI) += sdhci.o obj-$(CONFIG_MMC_SDHCI_PCI) += sdhci-pci.o -obj-$(CONFIG_MMC_SDHCI_PXA) += sdhci-pxa.o +obj-$(CONFIG_MMC_SDHCI_PXAV3) += sdhci-pxav3.o +obj-$(CONFIG_MMC_SDHCI_PXAV2) += sdhci-pxav2.o obj-$(CONFIG_MMC_SDHCI_S3C) += sdhci-s3c.o obj-$(CONFIG_MMC_SDHCI_SPEAR) += sdhci-spear.o obj-$(CONFIG_MMC_WBSD) += wbsd.o @@ -31,9 +32,7 @@ obj-$(CONFIG_MMC_SDRICOH_CS) += sdricoh_cs.o obj-$(CONFIG_MMC_TMIO) += tmio_mmc.o obj-$(CONFIG_MMC_TMIO_CORE) += tmio_mmc_core.o tmio_mmc_core-y := tmio_mmc_pio.o -ifneq ($(CONFIG_MMC_SDHI),n) -tmio_mmc_core-y += tmio_mmc_dma.o -endif +tmio_mmc_core-$(subst m,y,$(CONFIG_MMC_SDHI)) += tmio_mmc_dma.o obj-$(CONFIG_MMC_SDHI) += sh_mobile_sdhi.o obj-$(CONFIG_MMC_CB710) += cb710-mmc.o obj-$(CONFIG_MMC_VIA_SDMMC) += via-sdmmc.o @@ -44,17 +43,13 @@ obj-$(CONFIG_MMC_JZ4740) += jz4740_mmc.o obj-$(CONFIG_MMC_VUB300) += vub300.o obj-$(CONFIG_MMC_USHC) += ushc.o -obj-$(CONFIG_MMC_SDHCI_PLTFM) += sdhci-platform.o -sdhci-platform-y := sdhci-pltfm.o -sdhci-platform-$(CONFIG_MMC_SDHCI_CNS3XXX) += sdhci-cns3xxx.o -sdhci-platform-$(CONFIG_MMC_SDHCI_ESDHC_IMX) += sdhci-esdhc-imx.o -sdhci-platform-$(CONFIG_MMC_SDHCI_DOVE) += sdhci-dove.o -sdhci-platform-$(CONFIG_MMC_SDHCI_TEGRA) += sdhci-tegra.o - -obj-$(CONFIG_MMC_SDHCI_OF) += sdhci-of.o -sdhci-of-y := sdhci-of-core.o -sdhci-of-$(CONFIG_MMC_SDHCI_OF_ESDHC) += sdhci-of-esdhc.o -sdhci-of-$(CONFIG_MMC_SDHCI_OF_HLWD) += sdhci-of-hlwd.o +obj-$(CONFIG_MMC_SDHCI_PLTFM) += sdhci-pltfm.o +obj-$(CONFIG_MMC_SDHCI_CNS3XXX) += sdhci-cns3xxx.o +obj-$(CONFIG_MMC_SDHCI_ESDHC_IMX) += sdhci-esdhc-imx.o +obj-$(CONFIG_MMC_SDHCI_DOVE) += sdhci-dove.o +obj-$(CONFIG_MMC_SDHCI_TEGRA) += sdhci-tegra.o +obj-$(CONFIG_MMC_SDHCI_OF_ESDHC) += sdhci-of-esdhc.o +obj-$(CONFIG_MMC_SDHCI_OF_HLWD) += sdhci-of-hlwd.o ifeq ($(CONFIG_CB710_DEBUG),y) CFLAGS-cb710-mmc += -DDEBUG diff --git a/drivers/mmc/host/at91_mci.c b/drivers/mmc/host/at91_mci.c index d3e6a962f423..a4aa3af86fed 100644 --- a/drivers/mmc/host/at91_mci.c +++ b/drivers/mmc/host/at91_mci.c @@ -77,7 +77,8 @@ #include #include -#include + +#include "at91_mci.h" #define DRIVER_NAME "at91_mci" diff --git a/arch/arm/mach-at91/include/mach/at91_mci.h b/drivers/mmc/host/at91_mci.h similarity index 99% rename from arch/arm/mach-at91/include/mach/at91_mci.h rename to drivers/mmc/host/at91_mci.h index 02182c16a022..eec3a6b1c2bc 100644 --- a/arch/arm/mach-at91/include/mach/at91_mci.h +++ b/drivers/mmc/host/at91_mci.h @@ -1,5 +1,5 @@ /* - * arch/arm/mach-at91/include/mach/at91_mci.h + * drivers/mmc/host/at91_mci.h * * Copyright (C) 2005 Ivan Kokshaysky * Copyright (C) SAN People diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index aa8039f473c4..fa8cae1d7005 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -203,6 +203,7 @@ struct atmel_mci_slot { #define ATMCI_CARD_PRESENT 0 #define ATMCI_CARD_NEED_INIT 1 #define ATMCI_SHUTDOWN 2 +#define ATMCI_SUSPENDED 3 int detect_pin; int wp_pin; @@ -1878,10 +1879,72 @@ static int __exit atmci_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_PM +static int atmci_suspend(struct device *dev) +{ + struct atmel_mci *host = dev_get_drvdata(dev); + int i; + + for (i = 0; i < ATMEL_MCI_MAX_NR_SLOTS; i++) { + struct atmel_mci_slot *slot = 
host->slot[i]; + int ret; + + if (!slot) + continue; + ret = mmc_suspend_host(slot->mmc); + if (ret < 0) { + while (--i >= 0) { + slot = host->slot[i]; + if (slot + && test_bit(ATMCI_SUSPENDED, &slot->flags)) { + mmc_resume_host(host->slot[i]->mmc); + clear_bit(ATMCI_SUSPENDED, &slot->flags); + } + } + return ret; + } else { + set_bit(ATMCI_SUSPENDED, &slot->flags); + } + } + + return 0; +} + +static int atmci_resume(struct device *dev) +{ + struct atmel_mci *host = dev_get_drvdata(dev); + int i; + int ret = 0; + + for (i = 0; i < ATMEL_MCI_MAX_NR_SLOTS; i++) { + struct atmel_mci_slot *slot = host->slot[i]; + int err; + + slot = host->slot[i]; + if (!slot) + continue; + if (!test_bit(ATMCI_SUSPENDED, &slot->flags)) + continue; + err = mmc_resume_host(slot->mmc); + if (err < 0) + ret = err; + else + clear_bit(ATMCI_SUSPENDED, &slot->flags); + } + + return ret; +} +static SIMPLE_DEV_PM_OPS(atmci_pm, atmci_suspend, atmci_resume); +#define ATMCI_PM_OPS (&atmci_pm) +#else +#define ATMCI_PM_OPS NULL +#endif + static struct platform_driver atmci_driver = { .remove = __exit_p(atmci_remove), .driver = { .name = "atmel_mci", + .pm = ATMCI_PM_OPS, }, }; diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 66dcddb9c205..0c839d3338db 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "dw_mmc.h" @@ -100,6 +101,8 @@ struct dw_mci_slot { int last_detect_state; }; +static struct workqueue_struct *dw_mci_card_workqueue; + #if defined(CONFIG_DEBUG_FS) static int dw_mci_req_show(struct seq_file *s, void *v) { @@ -284,7 +287,7 @@ static void send_stop_cmd(struct dw_mci *host, struct mmc_data *data) /* DMA interface functions */ static void dw_mci_stop_dma(struct dw_mci *host) { - if (host->use_dma) { + if (host->using_dma) { host->dma_ops->stop(host); host->dma_ops->cleanup(host); } else { @@ -432,6 +435,8 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) unsigned int i, direction, sg_len; u32 temp; + host->using_dma = 0; + /* If we don't have a channel, we can't do DMA */ if (!host->use_dma) return -ENODEV; @@ -451,6 +456,8 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) return -EINVAL; } + host->using_dma = 1; + if (data->flags & MMC_DATA_READ) direction = DMA_FROM_DEVICE; else @@ -489,14 +496,18 @@ static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data) host->sg = NULL; host->data = data; + if (data->flags & MMC_DATA_READ) + host->dir_status = DW_MCI_RECV_STATUS; + else + host->dir_status = DW_MCI_SEND_STATUS; + if (dw_mci_submit_data_dma(host, data)) { host->sg = data->sg; host->pio_offset = 0; - if (data->flags & MMC_DATA_READ) - host->dir_status = DW_MCI_RECV_STATUS; - else - host->dir_status = DW_MCI_SEND_STATUS; + host->part_buf_start = 0; + host->part_buf_count = 0; + mci_writel(host, RINTSTS, SDMMC_INT_TXDR | SDMMC_INT_RXDR); temp = mci_readl(host, INTMASK); temp |= SDMMC_INT_TXDR | SDMMC_INT_RXDR; mci_writel(host, INTMASK, temp); @@ -574,7 +585,7 @@ static void dw_mci_setup_bus(struct dw_mci_slot *slot) } /* Set the current slot bus width */ - mci_writel(host, CTYPE, slot->ctype); + mci_writel(host, CTYPE, (slot->ctype << slot->id)); } static void dw_mci_start_request(struct dw_mci *host, @@ -624,13 +635,13 @@ static void dw_mci_start_request(struct dw_mci *host, host->stop_cmdr = dw_mci_prepare_command(slot->mmc, mrq->stop); } +/* must be called with host->lock held */ static void dw_mci_queue_request(struct dw_mci 
*host, struct dw_mci_slot *slot, struct mmc_request *mrq) { dev_vdbg(&slot->mmc->class_dev, "queue request: state=%d\n", host->state); - spin_lock_bh(&host->lock); slot->mrq = mrq; if (host->state == STATE_IDLE) { @@ -639,8 +650,6 @@ static void dw_mci_queue_request(struct dw_mci *host, struct dw_mci_slot *slot, } else { list_add_tail(&slot->queue_node, &host->queue); } - - spin_unlock_bh(&host->lock); } static void dw_mci_request(struct mmc_host *mmc, struct mmc_request *mrq) @@ -650,14 +659,23 @@ static void dw_mci_request(struct mmc_host *mmc, struct mmc_request *mrq) WARN_ON(slot->mrq); + /* + * The check for card presence and queueing of the request must be + * atomic, otherwise the card could be removed in between and the + * request wouldn't fail until another card was inserted. + */ + spin_lock_bh(&host->lock); + if (!test_bit(DW_MMC_CARD_PRESENT, &slot->flags)) { + spin_unlock_bh(&host->lock); mrq->cmd->error = -ENOMEDIUM; mmc_request_done(mmc, mrq); return; } - /* We don't support multiple blocks of weird lengths. */ dw_mci_queue_request(host, slot, mrq); + + spin_unlock_bh(&host->lock); } static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) @@ -831,7 +849,7 @@ static void dw_mci_tasklet_func(unsigned long priv) struct mmc_command *cmd; enum dw_mci_state state; enum dw_mci_state prev_state; - u32 status; + u32 status, ctrl; spin_lock(&host->lock); @@ -891,13 +909,19 @@ static void dw_mci_tasklet_func(unsigned long priv) if (status & DW_MCI_DATA_ERROR_FLAGS) { if (status & SDMMC_INT_DTO) { - dev_err(&host->pdev->dev, - "data timeout error\n"); data->error = -ETIMEDOUT; } else if (status & SDMMC_INT_DCRC) { - dev_err(&host->pdev->dev, - "data CRC error\n"); data->error = -EILSEQ; + } else if (status & SDMMC_INT_EBE && + host->dir_status == + DW_MCI_SEND_STATUS) { + /* + * No data CRC status was returned. + * The number of bytes transferred will + * be exaggerated in PIO mode. + */ + data->bytes_xfered = 0; + data->error = -ETIMEDOUT; } else { dev_err(&host->pdev->dev, "data FIFO error " @@ -905,6 +929,16 @@ static void dw_mci_tasklet_func(unsigned long priv) status); data->error = -EIO; } + /* + * After an error, there may be data lingering + * in the FIFO, so reset it - doing so + * generates a block interrupt, hence setting + * the scatter-gather pointer to NULL. 
+ */ + host->sg = NULL; + ctrl = mci_readl(host, CTRL); + ctrl |= SDMMC_CTRL_FIFO_RESET; + mci_writel(host, CTRL, ctrl); } else { data->bytes_xfered = data->blocks * data->blksz; data->error = 0; @@ -946,84 +980,278 @@ unlock: } -static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt) +/* push final bytes to part_buf, only use during push */ +static void dw_mci_set_part_bytes(struct dw_mci *host, void *buf, int cnt) { - u16 *pdata = (u16 *)buf; + memcpy((void *)&host->part_buf, buf, cnt); + host->part_buf_count = cnt; +} - WARN_ON(cnt % 2 != 0); +/* append bytes to part_buf, only use during push */ +static int dw_mci_push_part_bytes(struct dw_mci *host, void *buf, int cnt) +{ + cnt = min(cnt, (1 << host->data_shift) - host->part_buf_count); + memcpy((void *)&host->part_buf + host->part_buf_count, buf, cnt); + host->part_buf_count += cnt; + return cnt; +} - cnt = cnt >> 1; - while (cnt > 0) { - mci_writew(host, DATA, *pdata++); - cnt--; +/* pull first bytes from part_buf, only use during pull */ +static int dw_mci_pull_part_bytes(struct dw_mci *host, void *buf, int cnt) +{ + cnt = min(cnt, (int)host->part_buf_count); + if (cnt) { + memcpy(buf, (void *)&host->part_buf + host->part_buf_start, + cnt); + host->part_buf_count -= cnt; + host->part_buf_start += cnt; } + return cnt; } -static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt) +/* pull final bytes from the part_buf, assuming it's just been filled */ +static void dw_mci_pull_final_bytes(struct dw_mci *host, void *buf, int cnt) { - u16 *pdata = (u16 *)buf; + memcpy(buf, &host->part_buf, cnt); + host->part_buf_start = cnt; + host->part_buf_count = (1 << host->data_shift) - cnt; +} - WARN_ON(cnt % 2 != 0); +static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt) +{ + /* try and push anything in the part_buf */ + if (unlikely(host->part_buf_count)) { + int len = dw_mci_push_part_bytes(host, buf, cnt); + buf += len; + cnt -= len; + if (!sg_next(host->sg) || host->part_buf_count == 2) { + mci_writew(host, DATA, host->part_buf16); + host->part_buf_count = 0; + } + } +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (unlikely((unsigned long)buf & 0x1)) { + while (cnt >= 2) { + u16 aligned_buf[64]; + int len = min(cnt & -2, (int)sizeof(aligned_buf)); + int items = len >> 1; + int i; + /* memcpy from input buffer into aligned buffer */ + memcpy(aligned_buf, buf, len); + buf += len; + cnt -= len; + /* push data from aligned buffer into fifo */ + for (i = 0; i < items; ++i) + mci_writew(host, DATA, aligned_buf[i]); + } + } else +#endif + { + u16 *pdata = buf; + for (; cnt >= 2; cnt -= 2) + mci_writew(host, DATA, *pdata++); + buf = pdata; + } + /* put anything remaining in the part_buf */ + if (cnt) { + dw_mci_set_part_bytes(host, buf, cnt); + if (!sg_next(host->sg)) + mci_writew(host, DATA, host->part_buf16); + } +} - cnt = cnt >> 1; - while (cnt > 0) { - *pdata++ = mci_readw(host, DATA); - cnt--; +static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt) +{ +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (unlikely((unsigned long)buf & 0x1)) { + while (cnt >= 2) { + /* pull data from fifo into aligned buffer */ + u16 aligned_buf[64]; + int len = min(cnt & -2, (int)sizeof(aligned_buf)); + int items = len >> 1; + int i; + for (i = 0; i < items; ++i) + aligned_buf[i] = mci_readw(host, DATA); + /* memcpy from aligned buffer into output buffer */ + memcpy(buf, aligned_buf, len); + buf += len; + cnt -= len; + } + } else +#endif + { + u16 *pdata = buf; + for (; cnt >= 2; cnt -= 2) + 
*pdata++ = mci_readw(host, DATA);
+		buf = pdata;
+	}
+	if (cnt) {
+		host->part_buf16 = mci_readw(host, DATA);
+		dw_mci_pull_final_bytes(host, buf, cnt);
 	}
 }
 
 static void dw_mci_push_data32(struct dw_mci *host, void *buf, int cnt)
 {
-	u32 *pdata = (u32 *)buf;
-
-	WARN_ON(cnt % 4 != 0);
-	WARN_ON((unsigned long)pdata & 0x3);
-
-	cnt = cnt >> 2;
-	while (cnt > 0) {
-		mci_writel(host, DATA, *pdata++);
-		cnt--;
+	/* try and push anything in the part_buf */
+	if (unlikely(host->part_buf_count)) {
+		int len = dw_mci_push_part_bytes(host, buf, cnt);
+		buf += len;
+		cnt -= len;
+		if (!sg_next(host->sg) || host->part_buf_count == 4) {
+			mci_writel(host, DATA, host->part_buf32);
+			host->part_buf_count = 0;
+		}
+	}
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x3)) {
+		while (cnt >= 4) {
+			u32 aligned_buf[32];
+			int len = min(cnt & -4, (int)sizeof(aligned_buf));
+			int items = len >> 2;
+			int i;
+			/* memcpy from input buffer into aligned buffer */
+			memcpy(aligned_buf, buf, len);
+			buf += len;
+			cnt -= len;
+			/* push data from aligned buffer into fifo */
+			for (i = 0; i < items; ++i)
+				mci_writel(host, DATA, aligned_buf[i]);
+		}
+	} else
+#endif
+	{
+		u32 *pdata = buf;
+		for (; cnt >= 4; cnt -= 4)
+			mci_writel(host, DATA, *pdata++);
+		buf = pdata;
+	}
+	/* put anything remaining in the part_buf */
+	if (cnt) {
+		dw_mci_set_part_bytes(host, buf, cnt);
+		if (!sg_next(host->sg))
+			mci_writel(host, DATA, host->part_buf32);
 	}
 }
 
 static void dw_mci_pull_data32(struct dw_mci *host, void *buf, int cnt)
 {
-	u32 *pdata = (u32 *)buf;
-
-	WARN_ON(cnt % 4 != 0);
-	WARN_ON((unsigned long)pdata & 0x3);
-
-	cnt = cnt >> 2;
-	while (cnt > 0) {
-		*pdata++ = mci_readl(host, DATA);
-		cnt--;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x3)) {
+		while (cnt >= 4) {
+			/* pull data from fifo into aligned buffer */
+			u32 aligned_buf[32];
+			int len = min(cnt & -4, (int)sizeof(aligned_buf));
+			int items = len >> 2;
+			int i;
+			for (i = 0; i < items; ++i)
+				aligned_buf[i] = mci_readl(host, DATA);
+			/* memcpy from aligned buffer into output buffer */
+			memcpy(buf, aligned_buf, len);
+			buf += len;
+			cnt -= len;
+		}
+	} else
+#endif
+	{
+		u32 *pdata = buf;
+		for (; cnt >= 4; cnt -= 4)
+			*pdata++ = mci_readl(host, DATA);
+		buf = pdata;
+	}
+	if (cnt) {
+		host->part_buf32 = mci_readl(host, DATA);
+		dw_mci_pull_final_bytes(host, buf, cnt);
 	}
 }
 
 static void dw_mci_push_data64(struct dw_mci *host, void *buf, int cnt)
 {
-	u64 *pdata = (u64 *)buf;
-
-	WARN_ON(cnt % 8 != 0);
-
-	cnt = cnt >> 3;
-	while (cnt > 0) {
-		mci_writeq(host, DATA, *pdata++);
-		cnt--;
+	/* try and push anything in the part_buf */
+	if (unlikely(host->part_buf_count)) {
+		int len = dw_mci_push_part_bytes(host, buf, cnt);
+		buf += len;
+		cnt -= len;
+		if (!sg_next(host->sg) || host->part_buf_count == 8) {
+			/* part_buf is 64 bits wide; flush it with a
+			 * matching 64-bit write */
+			mci_writeq(host, DATA, host->part_buf);
+			host->part_buf_count = 0;
+		}
+	}
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (unlikely((unsigned long)buf & 0x7)) {
+		while (cnt >= 8) {
+			u64 aligned_buf[16];
+			int len = min(cnt & -8, (int)sizeof(aligned_buf));
+			int items = len >> 3;
+			int i;
+			/* memcpy from input buffer into aligned buffer */
+			memcpy(aligned_buf, buf, len);
+			buf += len;
+			cnt -= len;
+			/* push data from aligned buffer into fifo */
+			for (i = 0; i < items; ++i)
+				mci_writeq(host, DATA, aligned_buf[i]);
+		}
+	} else
+#endif
+	{
+		u64 *pdata = buf;
+		for (; cnt >= 8; cnt -= 8)
+			mci_writeq(host, DATA, *pdata++);
+		buf = pdata;
+	}
+	/* put anything remaining in the
part_buf */ + if (cnt) { + dw_mci_set_part_bytes(host, buf, cnt); + if (!sg_next(host->sg)) + mci_writeq(host, DATA, host->part_buf); } } static void dw_mci_pull_data64(struct dw_mci *host, void *buf, int cnt) { - u64 *pdata = (u64 *)buf; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (unlikely((unsigned long)buf & 0x7)) { + while (cnt >= 8) { + /* pull data from fifo into aligned buffer */ + u64 aligned_buf[16]; + int len = min(cnt & -8, (int)sizeof(aligned_buf)); + int items = len >> 3; + int i; + for (i = 0; i < items; ++i) + aligned_buf[i] = mci_readq(host, DATA); + /* memcpy from aligned buffer into output buffer */ + memcpy(buf, aligned_buf, len); + buf += len; + cnt -= len; + } + } else +#endif + { + u64 *pdata = buf; + for (; cnt >= 8; cnt -= 8) + *pdata++ = mci_readq(host, DATA); + buf = pdata; + } + if (cnt) { + host->part_buf = mci_readq(host, DATA); + dw_mci_pull_final_bytes(host, buf, cnt); + } +} - WARN_ON(cnt % 8 != 0); +static void dw_mci_pull_data(struct dw_mci *host, void *buf, int cnt) +{ + int len; - cnt = cnt >> 3; - while (cnt > 0) { - *pdata++ = mci_readq(host, DATA); - cnt--; - } + /* get remaining partial bytes */ + len = dw_mci_pull_part_bytes(host, buf, cnt); + if (unlikely(len == cnt)) + return; + buf += len; + cnt -= len; + + /* get the rest of the data */ + host->pull_data(host, buf, cnt); } static void dw_mci_read_data_pio(struct dw_mci *host) @@ -1037,9 +1265,10 @@ static void dw_mci_read_data_pio(struct dw_mci *host) unsigned int nbytes = 0, len; do { - len = SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift; + len = host->part_buf_count + + (SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift); if (offset + len <= sg->length) { - host->pull_data(host, (void *)(buf + offset), len); + dw_mci_pull_data(host, (void *)(buf + offset), len); offset += len; nbytes += len; @@ -1055,8 +1284,8 @@ static void dw_mci_read_data_pio(struct dw_mci *host) } } else { unsigned int remaining = sg->length - offset; - host->pull_data(host, (void *)(buf + offset), - remaining); + dw_mci_pull_data(host, (void *)(buf + offset), + remaining); nbytes += remaining; flush_dcache_page(sg_page(sg)); @@ -1066,7 +1295,7 @@ static void dw_mci_read_data_pio(struct dw_mci *host) offset = len - remaining; buf = sg_virt(sg); - host->pull_data(host, buf, offset); + dw_mci_pull_data(host, buf, offset); nbytes += offset; } @@ -1083,7 +1312,6 @@ static void dw_mci_read_data_pio(struct dw_mci *host) return; } } while (status & SDMMC_INT_RXDR); /*if the RXDR is ready read again*/ - len = SDMMC_GET_FCNT(mci_readl(host, STATUS)); host->pio_offset = offset; data->bytes_xfered += nbytes; return; @@ -1105,8 +1333,9 @@ static void dw_mci_write_data_pio(struct dw_mci *host) unsigned int nbytes = 0, len; do { - len = SDMMC_FIFO_SZ - - (SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift); + len = ((host->fifo_depth - + SDMMC_GET_FCNT(mci_readl(host, STATUS))) << shift) + - host->part_buf_count; if (offset + len <= sg->length) { host->push_data(host, (void *)(buf + offset), len); @@ -1151,10 +1380,8 @@ static void dw_mci_write_data_pio(struct dw_mci *host) return; } } while (status & SDMMC_INT_TXDR); /* if TXDR write again */ - host->pio_offset = offset; data->bytes_xfered += nbytes; - return; done: @@ -1202,7 +1429,6 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) host->cmd_status = status; smp_wmb(); set_bit(EVENT_CMD_COMPLETE, &host->pending_events); - tasklet_schedule(&host->tasklet); } if (pending & DW_MCI_DATA_ERROR_FLAGS) { @@ -1211,7 +1437,9 @@ static irqreturn_t dw_mci_interrupt(int 
irq, void *dev_id) host->data_status = status; smp_wmb(); set_bit(EVENT_DATA_ERROR, &host->pending_events); - tasklet_schedule(&host->tasklet); + if (!(pending & (SDMMC_INT_DTO | SDMMC_INT_DCRC | + SDMMC_INT_SBE | SDMMC_INT_EBE))) + tasklet_schedule(&host->tasklet); } if (pending & SDMMC_INT_DATA_OVER) { @@ -1229,13 +1457,13 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) if (pending & SDMMC_INT_RXDR) { mci_writel(host, RINTSTS, SDMMC_INT_RXDR); - if (host->sg) + if (host->dir_status == DW_MCI_RECV_STATUS && host->sg) dw_mci_read_data_pio(host); } if (pending & SDMMC_INT_TXDR) { mci_writel(host, RINTSTS, SDMMC_INT_TXDR); - if (host->sg) + if (host->dir_status == DW_MCI_SEND_STATUS && host->sg) dw_mci_write_data_pio(host); } @@ -1246,7 +1474,7 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) if (pending & SDMMC_INT_CD) { mci_writel(host, RINTSTS, SDMMC_INT_CD); - tasklet_schedule(&host->card_tasklet); + queue_work(dw_mci_card_workqueue, &host->card_work); } } while (pass_count++ < 5); @@ -1265,9 +1493,9 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void dw_mci_tasklet_card(unsigned long data) +static void dw_mci_work_routine_card(struct work_struct *work) { - struct dw_mci *host = (struct dw_mci *)data; + struct dw_mci *host = container_of(work, struct dw_mci, card_work); int i; for (i = 0; i < host->num_slots; i++) { @@ -1279,22 +1507,21 @@ static void dw_mci_tasklet_card(unsigned long data) present = dw_mci_get_cd(mmc); while (present != slot->last_detect_state) { - spin_lock(&host->lock); - dev_dbg(&slot->mmc->class_dev, "card %s\n", present ? "inserted" : "removed"); + /* Power up slot (before spin_lock, may sleep) */ + if (present != 0 && host->pdata->setpower) + host->pdata->setpower(slot->id, mmc->ocr_avail); + + spin_lock_bh(&host->lock); + /* Card change detected */ slot->last_detect_state = present; - /* Power up slot */ - if (present != 0) { - if (host->pdata->setpower) - host->pdata->setpower(slot->id, - mmc->ocr_avail); - + /* Mark card as present if applicable */ + if (present != 0) set_bit(DW_MMC_CARD_PRESENT, &slot->flags); - } /* Clean up queue if present */ mrq = slot->mrq; @@ -1344,8 +1571,6 @@ static void dw_mci_tasklet_card(unsigned long data) /* Power down slot */ if (present == 0) { - if (host->pdata->setpower) - host->pdata->setpower(slot->id, 0); clear_bit(DW_MMC_CARD_PRESENT, &slot->flags); /* @@ -1367,7 +1592,12 @@ static void dw_mci_tasklet_card(unsigned long data) } - spin_unlock(&host->lock); + spin_unlock_bh(&host->lock); + + /* Power down slot (after spin_unlock, may sleep) */ + if (present == 0 && host->pdata->setpower) + host->pdata->setpower(slot->id, 0); + present = dw_mci_get_cd(mmc); } @@ -1467,7 +1697,7 @@ static int __init dw_mci_init_slot(struct dw_mci *host, unsigned int id) * Card may have been plugged in prior to boot so we * need to run the detect tasklet */ - tasklet_schedule(&host->card_tasklet); + queue_work(dw_mci_card_workqueue, &host->card_work); return 0; } @@ -1645,8 +1875,19 @@ static int dw_mci_probe(struct platform_device *pdev) * FIFO threshold settings RxMark = fifo_size / 2 - 1, * Tx Mark = fifo_size / 2 DMA Size = 8 */ - fifo_size = mci_readl(host, FIFOTH); - fifo_size = (fifo_size >> 16) & 0x7ff; + if (!host->pdata->fifo_depth) { + /* + * Power-on value of RX_WMark is FIFO_DEPTH-1, but this may + * have been overwritten by the bootloader, just like we're + * about to do, so if you know the value for your hardware, you + * should put it in the platform data. 
+ */ + fifo_size = mci_readl(host, FIFOTH); + fifo_size = 1 + ((fifo_size >> 16) & 0x7ff); + } else { + fifo_size = host->pdata->fifo_depth; + } + host->fifo_depth = fifo_size; host->fifoth_val = ((0x2 << 28) | ((fifo_size/2 - 1) << 16) | ((fifo_size/2) << 0)); mci_writel(host, FIFOTH, host->fifoth_val); @@ -1656,12 +1897,15 @@ static int dw_mci_probe(struct platform_device *pdev) mci_writel(host, CLKSRC, 0); tasklet_init(&host->tasklet, dw_mci_tasklet_func, (unsigned long)host); - tasklet_init(&host->card_tasklet, - dw_mci_tasklet_card, (unsigned long)host); + dw_mci_card_workqueue = alloc_workqueue("dw-mci-card", + WQ_MEM_RECLAIM | WQ_NON_REENTRANT, 1); + if (!dw_mci_card_workqueue) + goto err_dmaunmap; + INIT_WORK(&host->card_work, dw_mci_work_routine_card); ret = request_irq(irq, dw_mci_interrupt, 0, "dw-mci", host); if (ret) - goto err_dmaunmap; + goto err_workqueue; platform_set_drvdata(pdev, host); @@ -1690,7 +1934,9 @@ static int dw_mci_probe(struct platform_device *pdev) mci_writel(host, CTRL, SDMMC_CTRL_INT_ENABLE); /* Enable mci interrupt */ dev_info(&pdev->dev, "DW MMC controller at irq %d, " - "%d bit host data width\n", irq, width); + "%d bit host data width, " + "%u deep fifo\n", + irq, width, fifo_size); if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO) dev_info(&pdev->dev, "Internal DMAC interrupt fix enabled.\n"); @@ -1705,6 +1951,9 @@ err_init_slot: } free_irq(irq, host); +err_workqueue: + destroy_workqueue(dw_mci_card_workqueue); + err_dmaunmap: if (host->use_dma && host->dma_ops->exit) host->dma_ops->exit(host); @@ -1744,6 +1993,7 @@ static int __exit dw_mci_remove(struct platform_device *pdev) mci_writel(host, CLKSRC, 0); free_irq(platform_get_irq(pdev, 0), host); + destroy_workqueue(dw_mci_card_workqueue); dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma); if (host->use_dma && host->dma_ops->exit) diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 23c662af5616..027d37735394 100644 --- a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -118,7 +118,6 @@ #define SDMMC_CMD_INDX(n) ((n) & 0x1F) /* Status register defines */ #define SDMMC_GET_FCNT(x) (((x)>>17) & 0x1FF) -#define SDMMC_FIFO_SZ 32 /* Internal DMAC interrupt defines */ #define SDMMC_IDMAC_INT_AI BIT(9) #define SDMMC_IDMAC_INT_NI BIT(8) @@ -134,22 +133,22 @@ /* Register access macros */ #define mci_readl(dev, reg) \ - __raw_readl(dev->regs + SDMMC_##reg) + __raw_readl((dev)->regs + SDMMC_##reg) #define mci_writel(dev, reg, value) \ - __raw_writel((value), dev->regs + SDMMC_##reg) + __raw_writel((value), (dev)->regs + SDMMC_##reg) /* 16-bit FIFO access macros */ #define mci_readw(dev, reg) \ - __raw_readw(dev->regs + SDMMC_##reg) + __raw_readw((dev)->regs + SDMMC_##reg) #define mci_writew(dev, reg, value) \ - __raw_writew((value), dev->regs + SDMMC_##reg) + __raw_writew((value), (dev)->regs + SDMMC_##reg) /* 64-bit FIFO access macros */ #ifdef readq #define mci_readq(dev, reg) \ - __raw_readq(dev->regs + SDMMC_##reg) + __raw_readq((dev)->regs + SDMMC_##reg) #define mci_writeq(dev, reg, value) \ - __raw_writeq((value), dev->regs + SDMMC_##reg) + __raw_writeq((value), (dev)->regs + SDMMC_##reg) #else /* * Dummy readq implementation for architectures that don't define it. @@ -160,9 +159,9 @@ * rest of the code free from ifdefs. 
*/ #define mci_readq(dev, reg) \ - (*(volatile u64 __force *)(dev->regs + SDMMC_##reg)) + (*(volatile u64 __force *)((dev)->regs + SDMMC_##reg)) #define mci_writeq(dev, reg, value) \ - (*(volatile u64 __force *)(dev->regs + SDMMC_##reg) = value) + (*(volatile u64 __force *)((dev)->regs + SDMMC_##reg) = (value)) #endif #endif /* _DW_MMC_H_ */ diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index fe140724a02e..fef7140eb1d0 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -226,6 +226,9 @@ static void __devinit mmci_dma_setup(struct mmci_host *host) return; } + /* initialize pre request cookie */ + host->next_data.cookie = 1; + /* Try to acquire a generic DMA engine slave channel */ dma_cap_zero(mask); dma_cap_set(DMA_SLAVE, mask); @@ -335,7 +338,8 @@ static void mmci_dma_unmap(struct mmci_host *host, struct mmc_data *data) dir = DMA_FROM_DEVICE; } - dma_unmap_sg(chan->device->dev, data->sg, data->sg_len, dir); + if (!data->host_cookie) + dma_unmap_sg(chan->device->dev, data->sg, data->sg_len, dir); /* * Use of DMA with scatter-gather is impossible. @@ -353,7 +357,8 @@ static void mmci_dma_data_error(struct mmci_host *host) dmaengine_terminate_all(host->dma_current); } -static int mmci_dma_start_data(struct mmci_host *host, unsigned int datactrl) +static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data, + struct mmci_host_next *next) { struct variant_data *variant = host->variant; struct dma_slave_config conf = { @@ -364,13 +369,20 @@ static int mmci_dma_start_data(struct mmci_host *host, unsigned int datactrl) .src_maxburst = variant->fifohalfsize >> 2, /* # of words */ .dst_maxburst = variant->fifohalfsize >> 2, /* # of words */ }; - struct mmc_data *data = host->data; struct dma_chan *chan; struct dma_device *device; struct dma_async_tx_descriptor *desc; int nr_sg; - host->dma_current = NULL; + /* Check if next job is already prepared */ + if (data->host_cookie && !next && + host->dma_current && host->dma_desc_current) + return 0; + + if (!next) { + host->dma_current = NULL; + host->dma_desc_current = NULL; + } if (data->flags & MMC_DATA_READ) { conf.direction = DMA_FROM_DEVICE; @@ -385,7 +397,7 @@ static int mmci_dma_start_data(struct mmci_host *host, unsigned int datactrl) return -EINVAL; /* If less than or equal to the fifo size, don't bother with DMA */ - if (host->size <= variant->fifosize) + if (data->blksz * data->blocks <= variant->fifosize) return -EINVAL; device = chan->device; @@ -399,14 +411,38 @@ static int mmci_dma_start_data(struct mmci_host *host, unsigned int datactrl) if (!desc) goto unmap_exit; - /* Okay, go for it. */ - host->dma_current = chan; + if (next) { + next->dma_chan = chan; + next->dma_desc = desc; + } else { + host->dma_current = chan; + host->dma_desc_current = desc; + } + + return 0; + unmap_exit: + if (!next) + dmaengine_terminate_all(chan); + dma_unmap_sg(device->dev, data->sg, data->sg_len, conf.direction); + return -ENOMEM; +} + +static int mmci_dma_start_data(struct mmci_host *host, unsigned int datactrl) +{ + int ret; + struct mmc_data *data = host->data; + + ret = mmci_dma_prep_data(host, host->data, NULL); + if (ret) + return ret; + + /* Okay, go for it. 
*/ dev_vdbg(mmc_dev(host->mmc), "Submit MMCI DMA job, sglen %d blksz %04x blks %04x flags %08x\n", data->sg_len, data->blksz, data->blocks, data->flags); - dmaengine_submit(desc); - dma_async_issue_pending(chan); + dmaengine_submit(host->dma_desc_current); + dma_async_issue_pending(host->dma_current); datactrl |= MCI_DPSM_DMAENABLE; @@ -421,14 +457,90 @@ static int mmci_dma_start_data(struct mmci_host *host, unsigned int datactrl) writel(readl(host->base + MMCIMASK0) | MCI_DATAENDMASK, host->base + MMCIMASK0); return 0; +} -unmap_exit: - dmaengine_terminate_all(chan); - dma_unmap_sg(device->dev, data->sg, data->sg_len, conf.direction); - return -ENOMEM; +static void mmci_get_next_data(struct mmci_host *host, struct mmc_data *data) +{ + struct mmci_host_next *next = &host->next_data; + + if (data->host_cookie && data->host_cookie != next->cookie) { + printk(KERN_WARNING "[%s] invalid cookie: data->host_cookie %d" + " host->next_data.cookie %d\n", + __func__, data->host_cookie, host->next_data.cookie); + data->host_cookie = 0; + } + + if (!data->host_cookie) + return; + + host->dma_desc_current = next->dma_desc; + host->dma_current = next->dma_chan; + + next->dma_desc = NULL; + next->dma_chan = NULL; } + +static void mmci_pre_request(struct mmc_host *mmc, struct mmc_request *mrq, + bool is_first_req) +{ + struct mmci_host *host = mmc_priv(mmc); + struct mmc_data *data = mrq->data; + struct mmci_host_next *nd = &host->next_data; + + if (!data) + return; + + if (data->host_cookie) { + data->host_cookie = 0; + return; + } + + /* if config for dma */ + if (((data->flags & MMC_DATA_WRITE) && host->dma_tx_channel) || + ((data->flags & MMC_DATA_READ) && host->dma_rx_channel)) { + if (mmci_dma_prep_data(host, data, nd)) + data->host_cookie = 0; + else + data->host_cookie = ++nd->cookie < 0 ? 
1 : nd->cookie; + } +} + +static void mmci_post_request(struct mmc_host *mmc, struct mmc_request *mrq, + int err) +{ + struct mmci_host *host = mmc_priv(mmc); + struct mmc_data *data = mrq->data; + struct dma_chan *chan; + enum dma_data_direction dir; + + if (!data) + return; + + if (data->flags & MMC_DATA_READ) { + dir = DMA_FROM_DEVICE; + chan = host->dma_rx_channel; + } else { + dir = DMA_TO_DEVICE; + chan = host->dma_tx_channel; + } + + + /* if config for dma */ + if (chan) { + if (err) + dmaengine_terminate_all(chan); + if (err || data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, dir); + mrq->data->host_cookie = 0; + } +} + #else /* Blank functions if the DMA engine is not available */ +static void mmci_get_next_data(struct mmci_host *host, struct mmc_data *data) +{ +} static inline void mmci_dma_setup(struct mmci_host *host) { } @@ -449,6 +561,10 @@ static inline int mmci_dma_start_data(struct mmci_host *host, unsigned int datac { return -ENOSYS; } + +#define mmci_pre_request NULL +#define mmci_post_request NULL + #endif static void mmci_start_data(struct mmci_host *host, struct mmc_data *data) @@ -872,6 +988,9 @@ static void mmci_request(struct mmc_host *mmc, struct mmc_request *mrq) host->mrq = mrq; + if (mrq->data) + mmci_get_next_data(host, mrq->data); + if (mrq->data && mrq->data->flags & MMC_DATA_READ) mmci_start_data(host, mrq->data); @@ -986,6 +1105,8 @@ static irqreturn_t mmci_cd_irq(int irq, void *dev_id) static const struct mmc_host_ops mmci_ops = { .request = mmci_request, + .pre_req = mmci_pre_request, + .post_req = mmci_post_request, .set_ios = mmci_set_ios, .get_ro = mmci_get_ro, .get_cd = mmci_get_cd, diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h index 2164e8c6476c..79e4143ab9df 100644 --- a/drivers/mmc/host/mmci.h +++ b/drivers/mmc/host/mmci.h @@ -166,6 +166,12 @@ struct clk; struct variant_data; struct dma_chan; +struct mmci_host_next { + struct dma_async_tx_descriptor *dma_desc; + struct dma_chan *dma_chan; + s32 cookie; +}; + struct mmci_host { phys_addr_t phybase; void __iomem *base; @@ -203,6 +209,8 @@ struct mmci_host { struct dma_chan *dma_current; struct dma_chan *dma_rx_channel; struct dma_chan *dma_tx_channel; + struct dma_async_tx_descriptor *dma_desc_current; + struct mmci_host_next next_data; #define dma_inprogress(host) ((host)->dma_current) #else diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index 99d39a6a1032..d513d47364d0 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -564,40 +564,38 @@ static void mxs_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) static void mxs_mmc_set_clk_rate(struct mxs_mmc_host *host, unsigned int rate) { - unsigned int ssp_rate, bit_rate; - u32 div1, div2; + unsigned int ssp_clk, ssp_sck; + u32 clock_divide, clock_rate; u32 val; - ssp_rate = clk_get_rate(host->clk); + ssp_clk = clk_get_rate(host->clk); - for (div1 = 2; div1 < 254; div1 += 2) { - div2 = ssp_rate / rate / div1; - if (div2 < 0x100) + for (clock_divide = 2; clock_divide <= 254; clock_divide += 2) { + clock_rate = DIV_ROUND_UP(ssp_clk, rate * clock_divide); + clock_rate = (clock_rate > 0) ? 
clock_rate - 1 : 0; + if (clock_rate <= 255) break; } - if (div1 >= 254) { + if (clock_divide > 254) { dev_err(mmc_dev(host->mmc), "%s: cannot set clock to %d\n", __func__, rate); return; } - if (div2 == 0) - bit_rate = ssp_rate / div1; - else - bit_rate = ssp_rate / div1 / div2; + ssp_sck = ssp_clk / clock_divide / (1 + clock_rate); val = readl(host->base + HW_SSP_TIMING); val &= ~(BM_SSP_TIMING_CLOCK_DIVIDE | BM_SSP_TIMING_CLOCK_RATE); - val |= BF_SSP(div1, TIMING_CLOCK_DIVIDE); - val |= BF_SSP(div2 - 1, TIMING_CLOCK_RATE); + val |= BF_SSP(clock_divide, TIMING_CLOCK_DIVIDE); + val |= BF_SSP(clock_rate, TIMING_CLOCK_RATE); writel(val, host->base + HW_SSP_TIMING); - host->clk_rate = bit_rate; + host->clk_rate = ssp_sck; dev_dbg(mmc_dev(host->mmc), - "%s: div1 %d, div2 %d, ssp %d, bit %d, rate %d\n", - __func__, div1, div2, ssp_rate, bit_rate, rate); + "%s: clock_divide %d, clock_rate %d, ssp_clk %d, rate_actual %d, rate_requested %d\n", + __func__, clock_divide, clock_rate, ssp_clk, ssp_sck, rate); } static void mxs_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index dedf3dab8a3b..21e4a799df48 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -116,15 +118,13 @@ #define OMAP_MMC4_DEVID 3 #define OMAP_MMC5_DEVID 4 +#define MMC_AUTOSUSPEND_DELAY 100 #define MMC_TIMEOUT_MS 20 #define OMAP_MMC_MASTER_CLOCK 96000000 +#define OMAP_MMC_MIN_CLOCK 400000 +#define OMAP_MMC_MAX_CLOCK 52000000 #define DRIVER_NAME "omap_hsmmc" -/* Timeouts for entering power saving states on inactivity, msec */ -#define OMAP_MMC_DISABLED_TIMEOUT 100 -#define OMAP_MMC_SLEEP_TIMEOUT 1000 -#define OMAP_MMC_OFF_TIMEOUT 8000 - /* * One controller can have multiple slots, like on some omap boards using * omap.c controller driver. 
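
The mxs_mmc_set_clk_rate() rework just above is worth pinning down numerically. The SSP clock is divided twice, SSP_SCK = SSP_CLK / (CLOCK_DIVIDE * (1 + CLOCK_RATE)), where CLOCK_DIVIDE must be even in 2..254 and CLOCK_RATE must fit in 8 bits. The old truncating division could land above the requested frequency; DIV_ROUND_UP guarantees the result never exceeds it. A host-side sketch of just the arithmetic (register writes omitted; the 28.8 MHz input is an illustrative value, not from the driver):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static int pick_divider(unsigned ssp_clk, unsigned rate,
			unsigned *divide, unsigned *rate_reg)
{
	unsigned d, r;

	for (d = 2; d <= 254; d += 2) {
		r = DIV_ROUND_UP(ssp_clk, rate * d);
		r = r > 0 ? r - 1 : 0;	/* register holds RATE, bus uses RATE+1 */
		if (r <= 255) {
			*divide = d;
			*rate_reg = r;
			return 0;
		}
	}
	return -1;			/* requested rate unreachable */
}

int main(void)
{
	unsigned d, r;

	if (pick_divider(28800000, 400000, &d, &r) == 0)
		printf("divide=%u rate=%u sck=%u\n",
		       d, r, 28800000 / d / (r + 1));	/* 400000 exactly */
	return 0;
}
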
Luckily this is not currently done on any known @@ -141,6 +141,11 @@ #define OMAP_HSMMC_WRITE(base, reg, val) \ __raw_writel((val), (base) + OMAP_HSMMC_##reg) +struct omap_hsmmc_next { + unsigned int dma_len; + s32 cookie; +}; + struct omap_hsmmc_host { struct device *dev; struct mmc_host *mmc; @@ -148,7 +153,6 @@ struct omap_hsmmc_host { struct mmc_command *cmd; struct mmc_data *data; struct clk *fclk; - struct clk *iclk; struct clk *dbclk; /* * vcc == configured supply @@ -184,6 +188,7 @@ struct omap_hsmmc_host { int reqs_blocked; int use_reg; int req_in_progress; + struct omap_hsmmc_next next_data; struct omap_mmc_platform_data *pdata; }; @@ -547,6 +552,15 @@ static void omap_hsmmc_gpio_free(struct omap_mmc_platform_data *pdata) gpio_free(pdata->slots[0].switch_pin); } +/* + * Start clock to the card + */ +static void omap_hsmmc_start_clock(struct omap_hsmmc_host *host) +{ + OMAP_HSMMC_WRITE(host->base, SYSCTL, + OMAP_HSMMC_READ(host->base, SYSCTL) | CEN); +} + /* * Stop clock to the card */ @@ -584,6 +598,81 @@ static void omap_hsmmc_disable_irq(struct omap_hsmmc_host *host) OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR); } +/* Calculate divisor for the given clock frequency */ +static u16 calc_divisor(struct mmc_ios *ios) +{ + u16 dsor = 0; + + if (ios->clock) { + dsor = DIV_ROUND_UP(OMAP_MMC_MASTER_CLOCK, ios->clock); + if (dsor > 250) + dsor = 250; + } + + return dsor; +} + +static void omap_hsmmc_set_clock(struct omap_hsmmc_host *host) +{ + struct mmc_ios *ios = &host->mmc->ios; + unsigned long regval; + unsigned long timeout; + + dev_dbg(mmc_dev(host->mmc), "Set clock to %uHz\n", ios->clock); + + omap_hsmmc_stop_clock(host); + + regval = OMAP_HSMMC_READ(host->base, SYSCTL); + regval = regval & ~(CLKD_MASK | DTO_MASK); + regval = regval | (calc_divisor(ios) << 6) | (DTO << 16); + OMAP_HSMMC_WRITE(host->base, SYSCTL, regval); + OMAP_HSMMC_WRITE(host->base, SYSCTL, + OMAP_HSMMC_READ(host->base, SYSCTL) | ICE); + + /* Wait till the ICS bit is set */ + timeout = jiffies + msecs_to_jiffies(MMC_TIMEOUT_MS); + while ((OMAP_HSMMC_READ(host->base, SYSCTL) & ICS) != ICS + && time_before(jiffies, timeout)) + cpu_relax(); + + omap_hsmmc_start_clock(host); +} + +static void omap_hsmmc_set_bus_width(struct omap_hsmmc_host *host) +{ + struct mmc_ios *ios = &host->mmc->ios; + u32 con; + + con = OMAP_HSMMC_READ(host->base, CON); + switch (ios->bus_width) { + case MMC_BUS_WIDTH_8: + OMAP_HSMMC_WRITE(host->base, CON, con | DW8); + break; + case MMC_BUS_WIDTH_4: + OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); + OMAP_HSMMC_WRITE(host->base, HCTL, + OMAP_HSMMC_READ(host->base, HCTL) | FOUR_BIT); + break; + case MMC_BUS_WIDTH_1: + OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); + OMAP_HSMMC_WRITE(host->base, HCTL, + OMAP_HSMMC_READ(host->base, HCTL) & ~FOUR_BIT); + break; + } +} + +static void omap_hsmmc_set_bus_mode(struct omap_hsmmc_host *host) +{ + struct mmc_ios *ios = &host->mmc->ios; + u32 con; + + con = OMAP_HSMMC_READ(host->base, CON); + if (ios->bus_mode == MMC_BUSMODE_OPENDRAIN) + OMAP_HSMMC_WRITE(host->base, CON, con | OD); + else + OMAP_HSMMC_WRITE(host->base, CON, con & ~OD); +} + #ifdef CONFIG_PM /* @@ -595,8 +684,7 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host) struct mmc_ios *ios = &host->mmc->ios; struct omap_mmc_platform_data *pdata = host->pdata; int context_loss = 0; - u32 hctl, capa, con; - u16 dsor = 0; + u32 hctl, capa; unsigned long timeout; if (pdata->get_context_loss_count) { @@ -658,54 +746,12 @@ static int omap_hsmmc_context_restore(struct 
omap_hsmmc_host *host) if (host->power_mode == MMC_POWER_OFF) goto out; - con = OMAP_HSMMC_READ(host->base, CON); - switch (ios->bus_width) { - case MMC_BUS_WIDTH_8: - OMAP_HSMMC_WRITE(host->base, CON, con | DW8); - break; - case MMC_BUS_WIDTH_4: - OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) | FOUR_BIT); - break; - case MMC_BUS_WIDTH_1: - OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) & ~FOUR_BIT); - break; - } - - if (ios->clock) { - dsor = OMAP_MMC_MASTER_CLOCK / ios->clock; - if (dsor < 1) - dsor = 1; - - if (OMAP_MMC_MASTER_CLOCK / dsor > ios->clock) - dsor++; - - if (dsor > 250) - dsor = 250; - } - - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, SYSCTL) & ~CEN); - OMAP_HSMMC_WRITE(host->base, SYSCTL, (dsor << 6) | (DTO << 16)); - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, SYSCTL) | ICE); + omap_hsmmc_set_bus_width(host); - timeout = jiffies + msecs_to_jiffies(MMC_TIMEOUT_MS); - while ((OMAP_HSMMC_READ(host->base, SYSCTL) & ICS) != ICS - && time_before(jiffies, timeout)) - ; + omap_hsmmc_set_clock(host); - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, SYSCTL) | CEN); + omap_hsmmc_set_bus_mode(host); - con = OMAP_HSMMC_READ(host->base, CON); - if (ios->bus_mode == MMC_BUSMODE_OPENDRAIN) - OMAP_HSMMC_WRITE(host->base, CON, con | OD); - else - OMAP_HSMMC_WRITE(host->base, CON, con & ~OD); out: host->context_loss = context_loss; @@ -973,14 +1019,14 @@ static void omap_hsmmc_dma_cleanup(struct omap_hsmmc_host *host, int errno) * Readable error output */ #ifdef CONFIG_MMC_DEBUG -static void omap_hsmmc_report_irq(struct omap_hsmmc_host *host, u32 status) +static void omap_hsmmc_dbg_report_irq(struct omap_hsmmc_host *host, u32 status) { /* --- means reserved bit without definition at documentation */ static const char *omap_hsmmc_status_bits[] = { - "CC", "TC", "BGE", "---", "BWR", "BRR", "---", "---", "CIRQ", - "OBI", "---", "---", "---", "---", "---", "ERRI", "CTO", "CCRC", - "CEB", "CIE", "DTO", "DCRC", "DEB", "---", "ACE", "---", - "---", "---", "---", "CERR", "CERR", "BADA", "---", "---", "---" + "CC" , "TC" , "BGE", "---", "BWR" , "BRR" , "---" , "---" , + "CIRQ", "OBI" , "---", "---", "---" , "---" , "---" , "ERRI", + "CTO" , "CCRC", "CEB", "CIE", "DTO" , "DCRC", "DEB" , "---" , + "ACE" , "---" , "---", "---", "CERR", "BADA", "---" , "---" }; char res[256]; char *buf = res; @@ -997,6 +1043,11 @@ static void omap_hsmmc_report_irq(struct omap_hsmmc_host *host, u32 status) dev_dbg(mmc_dev(host->mmc), "%s\n", res); } +#else +static inline void omap_hsmmc_dbg_report_irq(struct omap_hsmmc_host *host, + u32 status) +{ +} #endif /* CONFIG_MMC_DEBUG */ /* @@ -1055,9 +1106,7 @@ static void omap_hsmmc_do_irq(struct omap_hsmmc_host *host, int status) dev_dbg(mmc_dev(host->mmc), "IRQ Status is %x\n", status); if (status & ERR) { -#ifdef CONFIG_MMC_DEBUG - omap_hsmmc_report_irq(host, status); -#endif + omap_hsmmc_dbg_report_irq(host, status); if ((status & CMD_TIMEOUT) || (status & CMD_CRC)) { if (host->cmd) { @@ -1155,8 +1204,7 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd) int ret; /* Disable the clocks */ - clk_disable(host->fclk); - clk_disable(host->iclk); + pm_runtime_put_sync(host->dev); if (host->got_dbclk) clk_disable(host->dbclk); @@ -1167,8 +1215,7 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd) if (!ret) ret = 
mmc_slot(host).set_power(host->dev, host->slot_id, 1, vdd); - clk_enable(host->iclk); - clk_enable(host->fclk); + pm_runtime_get_sync(host->dev); if (host->got_dbclk) clk_enable(host->dbclk); @@ -1322,7 +1369,7 @@ static void omap_hsmmc_config_dma_params(struct omap_hsmmc_host *host, static void omap_hsmmc_dma_cb(int lch, u16 ch_status, void *cb_data) { struct omap_hsmmc_host *host = cb_data; - struct mmc_data *data = host->mrq->data; + struct mmc_data *data; int dma_ch, req_in_progress; if (!(ch_status & OMAP_DMA_BLOCK_IRQ)) { @@ -1337,6 +1384,7 @@ static void omap_hsmmc_dma_cb(int lch, u16 ch_status, void *cb_data) return; } + data = host->mrq->data; host->dma_sg_idx++; if (host->dma_sg_idx < host->dma_len) { /* Fire up the next transfer. */ @@ -1346,8 +1394,9 @@ static void omap_hsmmc_dma_cb(int lch, u16 ch_status, void *cb_data) return; } - dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, - omap_hsmmc_get_dma_dir(host, data)); + if (!data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, + omap_hsmmc_get_dma_dir(host, data)); req_in_progress = host->req_in_progress; dma_ch = host->dma_ch; @@ -1365,6 +1414,45 @@ static void omap_hsmmc_dma_cb(int lch, u16 ch_status, void *cb_data) } } +static int omap_hsmmc_pre_dma_transfer(struct omap_hsmmc_host *host, + struct mmc_data *data, + struct omap_hsmmc_next *next) +{ + int dma_len; + + if (!next && data->host_cookie && + data->host_cookie != host->next_data.cookie) { + printk(KERN_WARNING "[%s] invalid cookie: data->host_cookie %d" + " host->next_data.cookie %d\n", + __func__, data->host_cookie, host->next_data.cookie); + data->host_cookie = 0; + } + + /* Check if next job is already prepared */ + if (next || + (!next && data->host_cookie != host->next_data.cookie)) { + dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, + omap_hsmmc_get_dma_dir(host, data)); + + } else { + dma_len = host->next_data.dma_len; + host->next_data.dma_len = 0; + } + + + if (dma_len == 0) + return -EINVAL; + + if (next) { + next->dma_len = dma_len; + data->host_cookie = ++next->cookie < 0 ? 1 : next->cookie; + } else + host->dma_len = dma_len; + + return 0; +} + /* * Routine to configure and start DMA for the MMC card */ @@ -1398,9 +1486,10 @@ static int omap_hsmmc_start_dma_transfer(struct omap_hsmmc_host *host, mmc_hostname(host->mmc), ret); return ret; } + ret = omap_hsmmc_pre_dma_transfer(host, data, NULL); + if (ret) + return ret; - host->dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, - data->sg_len, omap_hsmmc_get_dma_dir(host, data)); host->dma_ch = dma_ch; host->dma_sg_idx = 0; @@ -1480,6 +1569,35 @@ omap_hsmmc_prepare_data(struct omap_hsmmc_host *host, struct mmc_request *req) return 0; } +static void omap_hsmmc_post_req(struct mmc_host *mmc, struct mmc_request *mrq, + int err) +{ + struct omap_hsmmc_host *host = mmc_priv(mmc); + struct mmc_data *data = mrq->data; + + if (host->use_dma) { + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, + omap_hsmmc_get_dma_dir(host, data)); + data->host_cookie = 0; + } +} + +static void omap_hsmmc_pre_req(struct mmc_host *mmc, struct mmc_request *mrq, + bool is_first_req) +{ + struct omap_hsmmc_host *host = mmc_priv(mmc); + + if (mrq->data->host_cookie) { + mrq->data->host_cookie = 0; + return ; + } + + if (host->use_dma) + if (omap_hsmmc_pre_dma_transfer(host, mrq->data, + &host->next_data)) + mrq->data->host_cookie = 0; +} + /* * Request function. 
for read/write operation */ @@ -1528,13 +1646,9 @@ static void omap_hsmmc_request(struct mmc_host *mmc, struct mmc_request *req) static void omap_hsmmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) { struct omap_hsmmc_host *host = mmc_priv(mmc); - u16 dsor = 0; - unsigned long regval; - unsigned long timeout; - u32 con; int do_send_init_stream = 0; - mmc_host_enable(host->mmc); + pm_runtime_get_sync(host->dev); if (ios->power_mode != host->power_mode) { switch (ios->power_mode) { @@ -1557,22 +1671,7 @@ static void omap_hsmmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) /* FIXME: set registers based only on changes to ios */ - con = OMAP_HSMMC_READ(host->base, CON); - switch (mmc->ios.bus_width) { - case MMC_BUS_WIDTH_8: - OMAP_HSMMC_WRITE(host->base, CON, con | DW8); - break; - case MMC_BUS_WIDTH_4: - OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) | FOUR_BIT); - break; - case MMC_BUS_WIDTH_1: - OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) & ~FOUR_BIT); - break; - } + omap_hsmmc_set_bus_width(host); if (host->pdata->controller_flags & OMAP_HSMMC_SUPPORTS_DUAL_VOLT) { /* Only MMC1 can interface at 3V without some flavor @@ -1592,47 +1691,14 @@ static void omap_hsmmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) } } - if (ios->clock) { - dsor = OMAP_MMC_MASTER_CLOCK / ios->clock; - if (dsor < 1) - dsor = 1; - - if (OMAP_MMC_MASTER_CLOCK / dsor > ios->clock) - dsor++; - - if (dsor > 250) - dsor = 250; - } - omap_hsmmc_stop_clock(host); - regval = OMAP_HSMMC_READ(host->base, SYSCTL); - regval = regval & ~(CLKD_MASK); - regval = regval | (dsor << 6) | (DTO << 16); - OMAP_HSMMC_WRITE(host->base, SYSCTL, regval); - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, SYSCTL) | ICE); - - /* Wait till the ICS bit is set */ - timeout = jiffies + msecs_to_jiffies(MMC_TIMEOUT_MS); - while ((OMAP_HSMMC_READ(host->base, SYSCTL) & ICS) != ICS - && time_before(jiffies, timeout)) - msleep(1); - - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, SYSCTL) | CEN); + omap_hsmmc_set_clock(host); if (do_send_init_stream) send_init_stream(host); - con = OMAP_HSMMC_READ(host->base, CON); - if (ios->bus_mode == MMC_BUSMODE_OPENDRAIN) - OMAP_HSMMC_WRITE(host->base, CON, con | OD); - else - OMAP_HSMMC_WRITE(host->base, CON, con & ~OD); + omap_hsmmc_set_bus_mode(host); - if (host->power_mode == MMC_POWER_OFF) - mmc_host_disable(host->mmc); - else - mmc_host_lazy_disable(host->mmc); + pm_runtime_put_autosuspend(host->dev); } static int omap_hsmmc_get_cd(struct mmc_host *mmc) @@ -1688,230 +1754,12 @@ static void omap_hsmmc_conf_bus_power(struct omap_hsmmc_host *host) set_sd_bus_power(host); } -/* - * Dynamic power saving handling, FSM: - * ENABLED -> DISABLED -> CARDSLEEP / REGSLEEP -> OFF - * ^___________| | | - * |______________________|______________________| - * - * ENABLED: mmc host is fully functional - * DISABLED: fclk is off - * CARDSLEEP: fclk is off, card is asleep, voltage regulator is asleep - * REGSLEEP: fclk is off, voltage regulator is asleep - * OFF: fclk is off, voltage regulator is off - * - * Transition handlers return the timeout for the next state transition - * or negative error. 
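
The power-saving state machine deleted below (ENABLED/DISABLED/CARDSLEEP/REGSLEEP/OFF, each with its own timeout) is exactly the bookkeeping the runtime-PM core provides generically: a usage count, an autosuspend timer, and runtime_suspend()/runtime_resume() callbacks into which the context save/restore moves. The per-access pattern the converted driver uses, sketched with the standard pm_runtime API (foo_do_io() is a hypothetical caller, error handling trimmed):

#include <linux/pm_runtime.h>

static int foo_do_io(struct device *dev)
{
	int ret;

	ret = pm_runtime_get_sync(dev);		/* runtime-resume if suspended */
	if (ret < 0) {
		pm_runtime_put_noidle(dev);	/* keep the usage count balanced */
		return ret;
	}

	/* ... touch the controller registers ... */

	pm_runtime_mark_last_busy(dev);		/* re-arm the autosuspend timer */
	pm_runtime_put_autosuspend(dev);	/* idle after the delay expires */
	return 0;
}
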
- */ - -enum {ENABLED = 0, DISABLED, CARDSLEEP, REGSLEEP, OFF}; - -/* Handler for [ENABLED -> DISABLED] transition */ -static int omap_hsmmc_enabled_to_disabled(struct omap_hsmmc_host *host) -{ - omap_hsmmc_context_save(host); - clk_disable(host->fclk); - host->dpm_state = DISABLED; - - dev_dbg(mmc_dev(host->mmc), "ENABLED -> DISABLED\n"); - - if (host->power_mode == MMC_POWER_OFF) - return 0; - - return OMAP_MMC_SLEEP_TIMEOUT; -} - -/* Handler for [DISABLED -> REGSLEEP / CARDSLEEP] transition */ -static int omap_hsmmc_disabled_to_sleep(struct omap_hsmmc_host *host) -{ - int err, new_state; - - if (!mmc_try_claim_host(host->mmc)) - return 0; - - clk_enable(host->fclk); - omap_hsmmc_context_restore(host); - if (mmc_card_can_sleep(host->mmc)) { - err = mmc_card_sleep(host->mmc); - if (err < 0) { - clk_disable(host->fclk); - mmc_release_host(host->mmc); - return err; - } - new_state = CARDSLEEP; - } else { - new_state = REGSLEEP; - } - if (mmc_slot(host).set_sleep) - mmc_slot(host).set_sleep(host->dev, host->slot_id, 1, 0, - new_state == CARDSLEEP); - /* FIXME: turn off bus power and perhaps interrupts too */ - clk_disable(host->fclk); - host->dpm_state = new_state; - - mmc_release_host(host->mmc); - - dev_dbg(mmc_dev(host->mmc), "DISABLED -> %s\n", - host->dpm_state == CARDSLEEP ? "CARDSLEEP" : "REGSLEEP"); - - if (mmc_slot(host).no_off) - return 0; - - if ((host->mmc->caps & MMC_CAP_NONREMOVABLE) || - mmc_slot(host).card_detect || - (mmc_slot(host).get_cover_state && - mmc_slot(host).get_cover_state(host->dev, host->slot_id))) - return OMAP_MMC_OFF_TIMEOUT; - - return 0; -} - -/* Handler for [REGSLEEP / CARDSLEEP -> OFF] transition */ -static int omap_hsmmc_sleep_to_off(struct omap_hsmmc_host *host) -{ - if (!mmc_try_claim_host(host->mmc)) - return 0; - - if (mmc_slot(host).no_off) - return 0; - - if (!((host->mmc->caps & MMC_CAP_NONREMOVABLE) || - mmc_slot(host).card_detect || - (mmc_slot(host).get_cover_state && - mmc_slot(host).get_cover_state(host->dev, host->slot_id)))) { - mmc_release_host(host->mmc); - return 0; - } - - mmc_slot(host).set_power(host->dev, host->slot_id, 0, 0); - host->vdd = 0; - host->power_mode = MMC_POWER_OFF; - - dev_dbg(mmc_dev(host->mmc), "%s -> OFF\n", - host->dpm_state == CARDSLEEP ? "CARDSLEEP" : "REGSLEEP"); - - host->dpm_state = OFF; - - mmc_release_host(host->mmc); - - return 0; -} - -/* Handler for [DISABLED -> ENABLED] transition */ -static int omap_hsmmc_disabled_to_enabled(struct omap_hsmmc_host *host) -{ - int err; - - err = clk_enable(host->fclk); - if (err < 0) - return err; - - omap_hsmmc_context_restore(host); - host->dpm_state = ENABLED; - - dev_dbg(mmc_dev(host->mmc), "DISABLED -> ENABLED\n"); - - return 0; -} - -/* Handler for [SLEEP -> ENABLED] transition */ -static int omap_hsmmc_sleep_to_enabled(struct omap_hsmmc_host *host) -{ - if (!mmc_try_claim_host(host->mmc)) - return 0; - - clk_enable(host->fclk); - omap_hsmmc_context_restore(host); - if (mmc_slot(host).set_sleep) - mmc_slot(host).set_sleep(host->dev, host->slot_id, 0, - host->vdd, host->dpm_state == CARDSLEEP); - if (mmc_card_can_sleep(host->mmc)) - mmc_card_awake(host->mmc); - - dev_dbg(mmc_dev(host->mmc), "%s -> ENABLED\n", - host->dpm_state == CARDSLEEP ? 
"CARDSLEEP" : "REGSLEEP"); - - host->dpm_state = ENABLED; - - mmc_release_host(host->mmc); - - return 0; -} - -/* Handler for [OFF -> ENABLED] transition */ -static int omap_hsmmc_off_to_enabled(struct omap_hsmmc_host *host) -{ - clk_enable(host->fclk); - - omap_hsmmc_context_restore(host); - omap_hsmmc_conf_bus_power(host); - mmc_power_restore_host(host->mmc); - - host->dpm_state = ENABLED; - - dev_dbg(mmc_dev(host->mmc), "OFF -> ENABLED\n"); - - return 0; -} - -/* - * Bring MMC host to ENABLED from any other PM state. - */ -static int omap_hsmmc_enable(struct mmc_host *mmc) -{ - struct omap_hsmmc_host *host = mmc_priv(mmc); - - switch (host->dpm_state) { - case DISABLED: - return omap_hsmmc_disabled_to_enabled(host); - case CARDSLEEP: - case REGSLEEP: - return omap_hsmmc_sleep_to_enabled(host); - case OFF: - return omap_hsmmc_off_to_enabled(host); - default: - dev_dbg(mmc_dev(host->mmc), "UNKNOWN state\n"); - return -EINVAL; - } -} - -/* - * Bring MMC host in PM state (one level deeper). - */ -static int omap_hsmmc_disable(struct mmc_host *mmc, int lazy) -{ - struct omap_hsmmc_host *host = mmc_priv(mmc); - - switch (host->dpm_state) { - case ENABLED: { - int delay; - - delay = omap_hsmmc_enabled_to_disabled(host); - if (lazy || delay < 0) - return delay; - return 0; - } - case DISABLED: - return omap_hsmmc_disabled_to_sleep(host); - case CARDSLEEP: - case REGSLEEP: - return omap_hsmmc_sleep_to_off(host); - default: - dev_dbg(mmc_dev(host->mmc), "UNKNOWN state\n"); - return -EINVAL; - } -} - static int omap_hsmmc_enable_fclk(struct mmc_host *mmc) { struct omap_hsmmc_host *host = mmc_priv(mmc); - int err; - err = clk_enable(host->fclk); - if (err) - return err; - dev_dbg(mmc_dev(host->mmc), "mmc_fclk: enabled\n"); - omap_hsmmc_context_restore(host); + pm_runtime_get_sync(host->dev); + return 0; } @@ -1919,26 +1767,17 @@ static int omap_hsmmc_disable_fclk(struct mmc_host *mmc, int lazy) { struct omap_hsmmc_host *host = mmc_priv(mmc); - omap_hsmmc_context_save(host); - clk_disable(host->fclk); - dev_dbg(mmc_dev(host->mmc), "mmc_fclk: disabled\n"); + pm_runtime_mark_last_busy(host->dev); + pm_runtime_put_autosuspend(host->dev); + return 0; } static const struct mmc_host_ops omap_hsmmc_ops = { .enable = omap_hsmmc_enable_fclk, .disable = omap_hsmmc_disable_fclk, - .request = omap_hsmmc_request, - .set_ios = omap_hsmmc_set_ios, - .get_cd = omap_hsmmc_get_cd, - .get_ro = omap_hsmmc_get_ro, - .init_card = omap_hsmmc_init_card, - /* NYET -- enable_sdio_irq */ -}; - -static const struct mmc_host_ops omap_hsmmc_ps_ops = { - .enable = omap_hsmmc_enable, - .disable = omap_hsmmc_disable, + .post_req = omap_hsmmc_post_req, + .pre_req = omap_hsmmc_pre_req, .request = omap_hsmmc_request, .set_ios = omap_hsmmc_set_ios, .get_cd = omap_hsmmc_get_cd, @@ -1968,15 +1807,12 @@ static int omap_hsmmc_regs_show(struct seq_file *s, void *data) host->dpm_state, mmc->nesting_cnt, host->context_loss, context_loss); - if (host->suspended || host->dpm_state == OFF) { + if (host->suspended) { seq_printf(s, "host suspended, can't read registers\n"); return 0; } - if (clk_enable(host->fclk) != 0) { - seq_printf(s, "can't read the regs\n"); - return 0; - } + pm_runtime_get_sync(host->dev); seq_printf(s, "SYSCONFIG:\t0x%08x\n", OMAP_HSMMC_READ(host->base, SYSCONFIG)); @@ -1993,7 +1829,8 @@ static int omap_hsmmc_regs_show(struct seq_file *s, void *data) seq_printf(s, "CAPA:\t\t0x%08x\n", OMAP_HSMMC_READ(host->base, CAPA)); - clk_disable(host->fclk); + pm_runtime_mark_last_busy(host->dev); + 
pm_runtime_put_autosuspend(host->dev); return 0; } @@ -2077,14 +1914,12 @@ static int __init omap_hsmmc_probe(struct platform_device *pdev) host->mapbase = res->start; host->base = ioremap(host->mapbase, SZ_4K); host->power_mode = MMC_POWER_OFF; + host->next_data.cookie = 1; platform_set_drvdata(pdev, host); INIT_WORK(&host->mmc_carddetect_work, omap_hsmmc_detect); - if (mmc_slot(host).power_saving) - mmc->ops = &omap_hsmmc_ps_ops; - else - mmc->ops = &omap_hsmmc_ops; + mmc->ops = &omap_hsmmc_ops; /* * If regulator_disable can only put vcc_aux to sleep then there is @@ -2093,44 +1928,26 @@ static int __init omap_hsmmc_probe(struct platform_device *pdev) if (mmc_slot(host).vcc_aux_disable_is_sleep) mmc_slot(host).no_off = 1; - mmc->f_min = 400000; - mmc->f_max = 52000000; + mmc->f_min = OMAP_MMC_MIN_CLOCK; + mmc->f_max = OMAP_MMC_MAX_CLOCK; spin_lock_init(&host->irq_lock); - host->iclk = clk_get(&pdev->dev, "ick"); - if (IS_ERR(host->iclk)) { - ret = PTR_ERR(host->iclk); - host->iclk = NULL; - goto err1; - } host->fclk = clk_get(&pdev->dev, "fck"); if (IS_ERR(host->fclk)) { ret = PTR_ERR(host->fclk); host->fclk = NULL; - clk_put(host->iclk); goto err1; } omap_hsmmc_context_save(host); mmc->caps |= MMC_CAP_DISABLE; - mmc_set_disable_delay(mmc, OMAP_MMC_DISABLED_TIMEOUT); - /* we start off in DISABLED state */ - host->dpm_state = DISABLED; - if (clk_enable(host->iclk) != 0) { - clk_put(host->iclk); - clk_put(host->fclk); - goto err1; - } - - if (mmc_host_enable(host->mmc) != 0) { - clk_disable(host->iclk); - clk_put(host->iclk); - clk_put(host->fclk); - goto err1; - } + pm_runtime_enable(host->dev); + pm_runtime_get_sync(host->dev); + pm_runtime_set_autosuspend_delay(host->dev, MMC_AUTOSUSPEND_DELAY); + pm_runtime_use_autosuspend(host->dev); if (cpu_is_omap2430()) { host->dbclk = clk_get(&pdev->dev, "mmchsdb_fck"); @@ -2240,8 +2057,6 @@ static int __init omap_hsmmc_probe(struct platform_device *pdev) omap_hsmmc_disable_irq(host); - mmc_host_lazy_disable(host->mmc); - omap_hsmmc_protect_card(host); mmc_add_host(mmc); @@ -2259,6 +2074,8 @@ static int __init omap_hsmmc_probe(struct platform_device *pdev) } omap_hsmmc_debugfs(mmc); + pm_runtime_mark_last_busy(host->dev); + pm_runtime_put_autosuspend(host->dev); return 0; @@ -2274,10 +2091,9 @@ err_reg: err_irq_cd_init: free_irq(host->irq, host); err_irq: - mmc_host_disable(host->mmc); - clk_disable(host->iclk); + pm_runtime_mark_last_busy(host->dev); + pm_runtime_put_autosuspend(host->dev); clk_put(host->fclk); - clk_put(host->iclk); if (host->got_dbclk) { clk_disable(host->dbclk); clk_put(host->dbclk); @@ -2299,7 +2115,7 @@ static int omap_hsmmc_remove(struct platform_device *pdev) struct resource *res; if (host) { - mmc_host_enable(host->mmc); + pm_runtime_get_sync(host->dev); mmc_remove_host(host->mmc); if (host->use_reg) omap_hsmmc_reg_put(host); @@ -2310,10 +2126,9 @@ static int omap_hsmmc_remove(struct platform_device *pdev) free_irq(mmc_slot(host).card_detect_irq, host); flush_work_sync(&host->mmc_carddetect_work); - mmc_host_disable(host->mmc); - clk_disable(host->iclk); + pm_runtime_put_sync(host->dev); + pm_runtime_disable(host->dev); clk_put(host->fclk); - clk_put(host->iclk); if (host->got_dbclk) { clk_disable(host->dbclk); clk_put(host->dbclk); @@ -2343,6 +2158,7 @@ static int omap_hsmmc_suspend(struct device *dev) return 0; if (host) { + pm_runtime_get_sync(host->dev); host->suspended = 1; if (host->pdata->suspend) { ret = host->pdata->suspend(&pdev->dev, @@ -2357,13 +2173,11 @@ static int omap_hsmmc_suspend(struct device *dev) } 
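
The probe/remove conversion above follows the usual runtime-PM bring-up order, which matters: enable first, take a reference while touching hardware, configure autosuspend, and only drop the reference once the host is registered. Reduced to a skeleton (foo_probe() is hypothetical; the 100 ms delay mirrors MMC_AUTOSUSPEND_DELAY above, everything else is elided):

#include <linux/platform_device.h>
#include <linux/pm_runtime.h>

#define FOO_AUTOSUSPEND_DELAY	100	/* ms */

static int foo_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;

	pm_runtime_enable(dev);
	pm_runtime_get_sync(dev);		/* stay powered during init */
	pm_runtime_set_autosuspend_delay(dev, FOO_AUTOSUSPEND_DELAY);
	pm_runtime_use_autosuspend(dev);

	/* ... ioremap, clk_get, request_irq, mmc_add_host ... */

	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);	/* allow suspend once idle */
	return 0;
}

Teardown runs the mirror image, as omap_hsmmc_remove() does above: pm_runtime_get_sync() before unregistering, then pm_runtime_put_sync() and pm_runtime_disable() at the end.
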
cancel_work_sync(&host->mmc_carddetect_work); ret = mmc_suspend_host(host->mmc); - mmc_host_enable(host->mmc); + if (ret == 0) { omap_hsmmc_disable_irq(host); OMAP_HSMMC_WRITE(host->base, HCTL, OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP); - mmc_host_disable(host->mmc); - clk_disable(host->iclk); if (host->got_dbclk) clk_disable(host->dbclk); } else { @@ -2375,9 +2189,8 @@ static int omap_hsmmc_suspend(struct device *dev) dev_dbg(mmc_dev(host->mmc), "Unmask interrupt failed\n"); } - mmc_host_disable(host->mmc); } - + pm_runtime_put_sync(host->dev); } return ret; } @@ -2393,14 +2206,7 @@ static int omap_hsmmc_resume(struct device *dev) return 0; if (host) { - ret = clk_enable(host->iclk); - if (ret) - goto clk_en_err; - - if (mmc_host_enable(host->mmc) != 0) { - clk_disable(host->iclk); - goto clk_en_err; - } + pm_runtime_get_sync(host->dev); if (host->got_dbclk) clk_enable(host->dbclk); @@ -2421,15 +2227,12 @@ static int omap_hsmmc_resume(struct device *dev) if (ret == 0) host->suspended = 0; - mmc_host_lazy_disable(host->mmc); + pm_runtime_mark_last_busy(host->dev); + pm_runtime_put_autosuspend(host->dev); } return ret; -clk_en_err: - dev_dbg(mmc_dev(host->mmc), - "Failed to enable MMC clocks during resume\n"); - return ret; } #else @@ -2437,9 +2240,33 @@ clk_en_err: #define omap_hsmmc_resume NULL #endif +static int omap_hsmmc_runtime_suspend(struct device *dev) +{ + struct omap_hsmmc_host *host; + + host = platform_get_drvdata(to_platform_device(dev)); + omap_hsmmc_context_save(host); + dev_dbg(mmc_dev(host->mmc), "disabled\n"); + + return 0; +} + +static int omap_hsmmc_runtime_resume(struct device *dev) +{ + struct omap_hsmmc_host *host; + + host = platform_get_drvdata(to_platform_device(dev)); + omap_hsmmc_context_restore(host); + dev_dbg(mmc_dev(host->mmc), "enabled\n"); + + return 0; +} + static struct dev_pm_ops omap_hsmmc_dev_pm_ops = { .suspend = omap_hsmmc_suspend, .resume = omap_hsmmc_resume, + .runtime_suspend = omap_hsmmc_runtime_suspend, + .runtime_resume = omap_hsmmc_runtime_resume, }; static struct platform_driver omap_hsmmc_driver = { diff --git a/drivers/mmc/host/sdhci-cns3xxx.c b/drivers/mmc/host/sdhci-cns3xxx.c index 9ebd1d7759dc..4b920b7621cf 100644 --- a/drivers/mmc/host/sdhci-cns3xxx.c +++ b/drivers/mmc/host/sdhci-cns3xxx.c @@ -15,9 +15,7 @@ #include #include #include -#include #include -#include "sdhci.h" #include "sdhci-pltfm.h" static unsigned int sdhci_cns3xxx_get_max_clk(struct sdhci_host *host) @@ -86,7 +84,7 @@ static struct sdhci_ops sdhci_cns3xxx_ops = { .set_clock = sdhci_cns3xxx_set_clock, }; -struct sdhci_pltfm_data sdhci_cns3xxx_pdata = { +static struct sdhci_pltfm_data sdhci_cns3xxx_pdata = { .ops = &sdhci_cns3xxx_ops, .quirks = SDHCI_QUIRK_BROKEN_DMA | SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | @@ -95,3 +93,43 @@ struct sdhci_pltfm_data sdhci_cns3xxx_pdata = { SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | SDHCI_QUIRK_NONSTANDARD_CLOCK, }; + +static int __devinit sdhci_cns3xxx_probe(struct platform_device *pdev) +{ + return sdhci_pltfm_register(pdev, &sdhci_cns3xxx_pdata); +} + +static int __devexit sdhci_cns3xxx_remove(struct platform_device *pdev) +{ + return sdhci_pltfm_unregister(pdev); +} + +static struct platform_driver sdhci_cns3xxx_driver = { + .driver = { + .name = "sdhci-cns3xxx", + .owner = THIS_MODULE, + }, + .probe = sdhci_cns3xxx_probe, + .remove = __devexit_p(sdhci_cns3xxx_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif +}; + +static int __init sdhci_cns3xxx_init(void) +{ + return 
platform_driver_register(&sdhci_cns3xxx_driver); +} +module_init(sdhci_cns3xxx_init); + +static void __exit sdhci_cns3xxx_exit(void) +{ + platform_driver_unregister(&sdhci_cns3xxx_driver); +} +module_exit(sdhci_cns3xxx_exit); + +MODULE_DESCRIPTION("SDHCI driver for CNS3xxx"); +MODULE_AUTHOR("Scott Shu, " + "Anton Vorontsov "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-dove.c b/drivers/mmc/host/sdhci-dove.c index 2aeef4ffed8c..f2d29dca4420 100644 --- a/drivers/mmc/host/sdhci-dove.c +++ b/drivers/mmc/host/sdhci-dove.c @@ -22,7 +22,6 @@ #include #include -#include "sdhci.h" #include "sdhci-pltfm.h" static u16 sdhci_dove_readw(struct sdhci_host *host, int reg) @@ -61,10 +60,50 @@ static struct sdhci_ops sdhci_dove_ops = { .read_l = sdhci_dove_readl, }; -struct sdhci_pltfm_data sdhci_dove_pdata = { +static struct sdhci_pltfm_data sdhci_dove_pdata = { .ops = &sdhci_dove_ops, .quirks = SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER | SDHCI_QUIRK_NO_BUSY_IRQ | SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | SDHCI_QUIRK_FORCE_DMA, }; + +static int __devinit sdhci_dove_probe(struct platform_device *pdev) +{ + return sdhci_pltfm_register(pdev, &sdhci_dove_pdata); +} + +static int __devexit sdhci_dove_remove(struct platform_device *pdev) +{ + return sdhci_pltfm_unregister(pdev); +} + +static struct platform_driver sdhci_dove_driver = { + .driver = { + .name = "sdhci-dove", + .owner = THIS_MODULE, + }, + .probe = sdhci_dove_probe, + .remove = __devexit_p(sdhci_dove_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif +}; + +static int __init sdhci_dove_init(void) +{ + return platform_driver_register(&sdhci_dove_driver); +} +module_init(sdhci_dove_init); + +static void __exit sdhci_dove_exit(void) +{ + platform_driver_unregister(&sdhci_dove_driver); +} +module_exit(sdhci_dove_exit); + +MODULE_DESCRIPTION("SDHCI driver for Dove"); +MODULE_AUTHOR("Saeed Bishara , " + "Mike Rapoport "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index a19967d0bfc4..710b706f4fcf 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -18,12 +18,10 @@ #include #include #include -#include #include #include #include #include -#include "sdhci.h" #include "sdhci-pltfm.h" #include "sdhci-esdhc.h" @@ -31,7 +29,7 @@ #define SDHCI_VENDOR_SPEC 0xC0 #define SDHCI_VENDOR_SPEC_SDIO_QUIRK 0x00000002 -#define ESDHC_FLAG_GPIO_FOR_CD_WP (1 << 0) +#define ESDHC_FLAG_GPIO_FOR_CD (1 << 0) /* * The CMDTYPE of the CMD register (offset 0xE) should be set to * "11" when the STOP CMD12 is issued on imx53 to abort one @@ -67,14 +65,14 @@ static u32 esdhc_readl_le(struct sdhci_host *host, int reg) u32 val = readl(host->ioaddr + reg); if (unlikely((reg == SDHCI_PRESENT_STATE) - && (imx_data->flags & ESDHC_FLAG_GPIO_FOR_CD_WP))) { + && (imx_data->flags & ESDHC_FLAG_GPIO_FOR_CD))) { struct esdhc_platform_data *boarddata = host->mmc->parent->platform_data; if (boarddata && gpio_is_valid(boarddata->cd_gpio) && gpio_get_value(boarddata->cd_gpio)) /* no card, if a valid gpio says so... */ - val &= SDHCI_CARD_PRESENT; + val &= ~SDHCI_CARD_PRESENT; else /* ... 
in all other cases assume card is present */ val |= SDHCI_CARD_PRESENT; @@ -89,7 +87,7 @@ static void esdhc_writel_le(struct sdhci_host *host, u32 val, int reg) struct pltfm_imx_data *imx_data = pltfm_host->priv; if (unlikely((reg == SDHCI_INT_ENABLE || reg == SDHCI_SIGNAL_ENABLE) - && (imx_data->flags & ESDHC_FLAG_GPIO_FOR_CD_WP))) + && (imx_data->flags & ESDHC_FLAG_GPIO_FOR_CD))) /* * these interrupts won't work with a custom card_detect gpio * (only applied to mx25/35) @@ -191,16 +189,6 @@ static unsigned int esdhc_pltfm_get_min_clock(struct sdhci_host *host) return clk_get_rate(pltfm_host->clk) / 256 / 16; } -static unsigned int esdhc_pltfm_get_ro(struct sdhci_host *host) -{ - struct esdhc_platform_data *boarddata = host->mmc->parent->platform_data; - - if (boarddata && gpio_is_valid(boarddata->wp_gpio)) - return gpio_get_value(boarddata->wp_gpio); - else - return -ENOSYS; -} - static struct sdhci_ops sdhci_esdhc_ops = { .read_l = esdhc_readl_le, .read_w = esdhc_readw_le, @@ -212,6 +200,24 @@ static struct sdhci_ops sdhci_esdhc_ops = { .get_min_clock = esdhc_pltfm_get_min_clock, }; +static struct sdhci_pltfm_data sdhci_esdhc_imx_pdata = { + .quirks = ESDHC_DEFAULT_QUIRKS | SDHCI_QUIRK_BROKEN_ADMA + | SDHCI_QUIRK_BROKEN_CARD_DETECTION, + /* ADMA has issues. Might be fixable */ + .ops = &sdhci_esdhc_ops, +}; + +static unsigned int esdhc_pltfm_get_ro(struct sdhci_host *host) +{ + struct esdhc_platform_data *boarddata = + host->mmc->parent->platform_data; + + if (boarddata && gpio_is_valid(boarddata->wp_gpio)) + return gpio_get_value(boarddata->wp_gpio); + else + return -ENOSYS; +} + static irqreturn_t cd_irq(int irq, void *data) { struct sdhci_host *sdhost = (struct sdhci_host *)data; @@ -220,30 +226,35 @@ static irqreturn_t cd_irq(int irq, void *data) return IRQ_HANDLED; }; -static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pdata) +static int __devinit sdhci_esdhc_imx_probe(struct platform_device *pdev) { - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct esdhc_platform_data *boarddata = host->mmc->parent->platform_data; + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_host *host; + struct esdhc_platform_data *boarddata; struct clk *clk; int err; struct pltfm_imx_data *imx_data; + host = sdhci_pltfm_init(pdev, &sdhci_esdhc_imx_pdata); + if (IS_ERR(host)) + return PTR_ERR(host); + + pltfm_host = sdhci_priv(host); + + imx_data = kzalloc(sizeof(struct pltfm_imx_data), GFP_KERNEL); + if (!imx_data) + return -ENOMEM; + pltfm_host->priv = imx_data; + clk = clk_get(mmc_dev(host->mmc), NULL); if (IS_ERR(clk)) { dev_err(mmc_dev(host->mmc), "clk err\n"); - return PTR_ERR(clk); + err = PTR_ERR(clk); + goto err_clk_get; } clk_enable(clk); pltfm_host->clk = clk; - imx_data = kzalloc(sizeof(struct pltfm_imx_data), GFP_KERNEL); - if (!imx_data) { - clk_disable(pltfm_host->clk); - clk_put(pltfm_host->clk); - return -ENOMEM; - } - pltfm_host->priv = imx_data; - if (!cpu_is_mx25()) host->quirks |= SDHCI_QUIRK_BROKEN_TIMEOUT_VAL; @@ -257,6 +268,7 @@ static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pd if (!(cpu_is_mx25() || cpu_is_mx35() || cpu_is_mx51())) imx_data->flags |= ESDHC_FLAG_MULTIBLK_NO_INT; + boarddata = host->mmc->parent->platform_data; if (boarddata) { err = gpio_request_one(boarddata->wp_gpio, GPIOF_IN, "ESDHC_WP"); if (err) { @@ -284,11 +296,15 @@ static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pd goto no_card_detect_irq; } - imx_data->flags |= ESDHC_FLAG_GPIO_FOR_CD_WP; + 
imx_data->flags |= ESDHC_FLAG_GPIO_FOR_CD; /* Now we have a working card_detect again */ host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION; } + err = sdhci_add_host(host); + if (err) + goto err_add_host; + return 0; no_card_detect_irq: @@ -297,14 +313,23 @@ static int esdhc_pltfm_init(struct sdhci_host *host, struct sdhci_pltfm_data *pd boarddata->cd_gpio = err; not_supported: kfree(imx_data); - return 0; + err_add_host: + clk_disable(pltfm_host->clk); + clk_put(pltfm_host->clk); + err_clk_get: + sdhci_pltfm_free(pdev); + return err; } -static void esdhc_pltfm_exit(struct sdhci_host *host) +static int __devexit sdhci_esdhc_imx_remove(struct platform_device *pdev) { + struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct esdhc_platform_data *boarddata = host->mmc->parent->platform_data; struct pltfm_imx_data *imx_data = pltfm_host->priv; + int dead = (readl(host->ioaddr + SDHCI_INT_STATUS) == 0xffffffff); + + sdhci_remove_host(host, dead); if (boarddata && gpio_is_valid(boarddata->wp_gpio)) gpio_free(boarddata->wp_gpio); @@ -319,13 +344,37 @@ static void esdhc_pltfm_exit(struct sdhci_host *host) clk_disable(pltfm_host->clk); clk_put(pltfm_host->clk); kfree(imx_data); + + sdhci_pltfm_free(pdev); + + return 0; } -struct sdhci_pltfm_data sdhci_esdhc_imx_pdata = { - .quirks = ESDHC_DEFAULT_QUIRKS | SDHCI_QUIRK_BROKEN_ADMA - | SDHCI_QUIRK_BROKEN_CARD_DETECTION, - /* ADMA has issues. Might be fixable */ - .ops = &sdhci_esdhc_ops, - .init = esdhc_pltfm_init, - .exit = esdhc_pltfm_exit, +static struct platform_driver sdhci_esdhc_imx_driver = { + .driver = { + .name = "sdhci-esdhc-imx", + .owner = THIS_MODULE, + }, + .probe = sdhci_esdhc_imx_probe, + .remove = __devexit_p(sdhci_esdhc_imx_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif }; + +static int __init sdhci_esdhc_imx_init(void) +{ + return platform_driver_register(&sdhci_esdhc_imx_driver); +} +module_init(sdhci_esdhc_imx_init); + +static void __exit sdhci_esdhc_imx_exit(void) +{ + platform_driver_unregister(&sdhci_esdhc_imx_driver); +} +module_exit(sdhci_esdhc_imx_exit); + +MODULE_DESCRIPTION("SDHCI driver for Freescale i.MX eSDHC"); +MODULE_AUTHOR("Wolfram Sang "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-of-core.c b/drivers/mmc/host/sdhci-of-core.c deleted file mode 100644 index 60e4186a4345..000000000000 --- a/drivers/mmc/host/sdhci-of-core.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * OpenFirmware bindings for Secure Digital Host Controller Interface. - * - * Copyright (c) 2007 Freescale Semiconductor, Inc. - * Copyright (c) 2009 MontaVista Software, Inc. - * - * Authors: Xiaobo Xie - * Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_PPC -#include -#endif -#include "sdhci-of.h" -#include "sdhci.h" - -#ifdef CONFIG_MMC_SDHCI_BIG_ENDIAN_32BIT_BYTE_SWAPPER - -/* - * These accessors are designed for big endian hosts doing I/O to - * little endian controllers incorporating a 32-bit hardware byte swapper. 
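
The sdhci_be32bs_*() accessors removed below (they reappear in sdhci-pltfm.c later in this diff) handle a big-endian CPU talking to a little-endian SDHCI register file through a bus that byte-swaps every 32-bit word. Sub-word accesses must therefore flip the low address bits: reg ^ 0x2 for 16-bit, reg ^ 0x3 for 8-bit. A standalone model of why the xor is the right fix-up:

#include <stdint.h>
#include <stdio.h>

static uint8_t regs[4] = { 0x11, 0x22, 0x33, 0x44 };	/* LE register file */

/* what the CPU sees once the bus swaps each 32-bit word */
static uint8_t host_byte(unsigned addr)
{
	return regs[(addr & ~3u) + (3 - (addr & 3))];
}

/* xor-ing the low bits undoes the swap for a single-byte access */
static uint8_t readb_swapped(unsigned reg)
{
	return host_byte(reg ^ 0x3);
}

int main(void)
{
	unsigned i;

	for (i = 0; i < 4; i++)
		printf("%02x ", readb_swapped(i));	/* 11 22 33 44 */
	printf("\n");
	return 0;
}

The xfer_mode_shadow field exists for the same reason: TRANSFER_MODE and COMMAND share one 32-bit word, so the 16-bit TRANSFER_MODE write is deferred and fused into the 32-bit write that carries the command, as sdhci_be32bs_writew() below shows.
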
- */ - -u32 sdhci_be32bs_readl(struct sdhci_host *host, int reg) -{ - return in_be32(host->ioaddr + reg); -} - -u16 sdhci_be32bs_readw(struct sdhci_host *host, int reg) -{ - return in_be16(host->ioaddr + (reg ^ 0x2)); -} - -u8 sdhci_be32bs_readb(struct sdhci_host *host, int reg) -{ - return in_8(host->ioaddr + (reg ^ 0x3)); -} - -void sdhci_be32bs_writel(struct sdhci_host *host, u32 val, int reg) -{ - out_be32(host->ioaddr + reg, val); -} - -void sdhci_be32bs_writew(struct sdhci_host *host, u16 val, int reg) -{ - struct sdhci_of_host *of_host = sdhci_priv(host); - int base = reg & ~0x3; - int shift = (reg & 0x2) * 8; - - switch (reg) { - case SDHCI_TRANSFER_MODE: - /* - * Postpone this write, we must do it together with a - * command write that is down below. - */ - of_host->xfer_mode_shadow = val; - return; - case SDHCI_COMMAND: - sdhci_be32bs_writel(host, val << 16 | of_host->xfer_mode_shadow, - SDHCI_TRANSFER_MODE); - return; - } - clrsetbits_be32(host->ioaddr + base, 0xffff << shift, val << shift); -} - -void sdhci_be32bs_writeb(struct sdhci_host *host, u8 val, int reg) -{ - int base = reg & ~0x3; - int shift = (reg & 0x3) * 8; - - clrsetbits_be32(host->ioaddr + base , 0xff << shift, val << shift); -} -#endif /* CONFIG_MMC_SDHCI_BIG_ENDIAN_32BIT_BYTE_SWAPPER */ - -#ifdef CONFIG_PM - -static int sdhci_of_suspend(struct platform_device *ofdev, pm_message_t state) -{ - struct sdhci_host *host = dev_get_drvdata(&ofdev->dev); - - return mmc_suspend_host(host->mmc); -} - -static int sdhci_of_resume(struct platform_device *ofdev) -{ - struct sdhci_host *host = dev_get_drvdata(&ofdev->dev); - - return mmc_resume_host(host->mmc); -} - -#else - -#define sdhci_of_suspend NULL -#define sdhci_of_resume NULL - -#endif - -static bool __devinit sdhci_of_wp_inverted(struct device_node *np) -{ - if (of_get_property(np, "sdhci,wp-inverted", NULL)) - return true; - - /* Old device trees don't have the wp-inverted property. 
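
sdhci_of_wp_inverted() above shows the two common OF parsing idioms, and both survive this file's removal (they reappear in sdhci_get_of_property() in sdhci-pltfm.c further down). Condensed into one helper (parse_node() is a hypothetical name; np, host and pltfm_host as in that function):

#include <linux/of.h>

static void parse_node(struct device_node *np, struct sdhci_host *host,
		       struct sdhci_pltfm_host *pltfm_host)
{
	const __be32 *clk;
	int size;

	/* boolean property: its mere presence is the whole message */
	if (of_get_property(np, "sdhci,wp-inverted", NULL))
		host->quirks |= SDHCI_QUIRK_INVERTED_WRITE_PROTECT;

	/* fixed-width cell: length-check before dereferencing */
	clk = of_get_property(np, "clock-frequency", &size);
	if (clk && size == sizeof(*clk) && *clk)
		pltfm_host->clock = be32_to_cpup(clk);
}

The machine_is() fallback exists because a boolean property can only widen coverage: old device trees that predate "sdhci,wp-inverted" still need the quirk on the two known MPC837x boards.
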
*/ -#ifdef CONFIG_PPC - return machine_is(mpc837x_rdb) || machine_is(mpc837x_mds); -#else - return false; -#endif -} - -static const struct of_device_id sdhci_of_match[]; -static int __devinit sdhci_of_probe(struct platform_device *ofdev) -{ - const struct of_device_id *match; - struct device_node *np = ofdev->dev.of_node; - struct sdhci_of_data *sdhci_of_data; - struct sdhci_host *host; - struct sdhci_of_host *of_host; - const __be32 *clk; - int size; - int ret; - - match = of_match_device(sdhci_of_match, &ofdev->dev); - if (!match) - return -EINVAL; - sdhci_of_data = match->data; - - if (!of_device_is_available(np)) - return -ENODEV; - - host = sdhci_alloc_host(&ofdev->dev, sizeof(*of_host)); - if (IS_ERR(host)) - return -ENOMEM; - - of_host = sdhci_priv(host); - dev_set_drvdata(&ofdev->dev, host); - - host->ioaddr = of_iomap(np, 0); - if (!host->ioaddr) { - ret = -ENOMEM; - goto err_addr_map; - } - - host->irq = irq_of_parse_and_map(np, 0); - if (!host->irq) { - ret = -EINVAL; - goto err_no_irq; - } - - host->hw_name = dev_name(&ofdev->dev); - if (sdhci_of_data) { - host->quirks = sdhci_of_data->quirks; - host->ops = &sdhci_of_data->ops; - } - - if (of_get_property(np, "sdhci,auto-cmd12", NULL)) - host->quirks |= SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12; - - - if (of_get_property(np, "sdhci,1-bit-only", NULL)) - host->quirks |= SDHCI_QUIRK_FORCE_1_BIT_DATA; - - if (sdhci_of_wp_inverted(np)) - host->quirks |= SDHCI_QUIRK_INVERTED_WRITE_PROTECT; - - clk = of_get_property(np, "clock-frequency", &size); - if (clk && size == sizeof(*clk) && *clk) - of_host->clock = be32_to_cpup(clk); - - ret = sdhci_add_host(host); - if (ret) - goto err_add_host; - - return 0; - -err_add_host: - irq_dispose_mapping(host->irq); -err_no_irq: - iounmap(host->ioaddr); -err_addr_map: - sdhci_free_host(host); - return ret; -} - -static int __devexit sdhci_of_remove(struct platform_device *ofdev) -{ - struct sdhci_host *host = dev_get_drvdata(&ofdev->dev); - - sdhci_remove_host(host, 0); - sdhci_free_host(host); - irq_dispose_mapping(host->irq); - iounmap(host->ioaddr); - return 0; -} - -static const struct of_device_id sdhci_of_match[] = { -#ifdef CONFIG_MMC_SDHCI_OF_ESDHC - { .compatible = "fsl,mpc8379-esdhc", .data = &sdhci_esdhc, }, - { .compatible = "fsl,mpc8536-esdhc", .data = &sdhci_esdhc, }, - { .compatible = "fsl,esdhc", .data = &sdhci_esdhc, }, -#endif -#ifdef CONFIG_MMC_SDHCI_OF_HLWD - { .compatible = "nintendo,hollywood-sdhci", .data = &sdhci_hlwd, }, -#endif - { .compatible = "generic-sdhci", }, - {}, -}; -MODULE_DEVICE_TABLE(of, sdhci_of_match); - -static struct platform_driver sdhci_of_driver = { - .driver = { - .name = "sdhci-of", - .owner = THIS_MODULE, - .of_match_table = sdhci_of_match, - }, - .probe = sdhci_of_probe, - .remove = __devexit_p(sdhci_of_remove), - .suspend = sdhci_of_suspend, - .resume = sdhci_of_resume, -}; - -static int __init sdhci_of_init(void) -{ - return platform_driver_register(&sdhci_of_driver); -} -module_init(sdhci_of_init); - -static void __exit sdhci_of_exit(void) -{ - platform_driver_unregister(&sdhci_of_driver); -} -module_exit(sdhci_of_exit); - -MODULE_DESCRIPTION("Secure Digital Host Controller Interface OF driver"); -MODULE_AUTHOR("Xiaobo Xie , " - "Anton Vorontsov "); -MODULE_LICENSE("GPL"); diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index ba40d6d035c7..fe604df65011 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -16,8 +16,7 @@ #include #include #include -#include "sdhci-of.h" -#include 
"sdhci.h" +#include "sdhci-pltfm.h" #include "sdhci-esdhc.h" static u16 esdhc_readw(struct sdhci_host *host, int reg) @@ -60,32 +59,83 @@ static int esdhc_of_enable_dma(struct sdhci_host *host) static unsigned int esdhc_of_get_max_clock(struct sdhci_host *host) { - struct sdhci_of_host *of_host = sdhci_priv(host); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - return of_host->clock; + return pltfm_host->clock; } static unsigned int esdhc_of_get_min_clock(struct sdhci_host *host) { - struct sdhci_of_host *of_host = sdhci_priv(host); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - return of_host->clock / 256 / 16; + return pltfm_host->clock / 256 / 16; } -struct sdhci_of_data sdhci_esdhc = { +static struct sdhci_ops sdhci_esdhc_ops = { + .read_l = sdhci_be32bs_readl, + .read_w = esdhc_readw, + .read_b = sdhci_be32bs_readb, + .write_l = sdhci_be32bs_writel, + .write_w = esdhc_writew, + .write_b = esdhc_writeb, + .set_clock = esdhc_set_clock, + .enable_dma = esdhc_of_enable_dma, + .get_max_clock = esdhc_of_get_max_clock, + .get_min_clock = esdhc_of_get_min_clock, +}; + +static struct sdhci_pltfm_data sdhci_esdhc_pdata = { /* card detection could be handled via GPIO */ .quirks = ESDHC_DEFAULT_QUIRKS | SDHCI_QUIRK_BROKEN_CARD_DETECTION | SDHCI_QUIRK_NO_CARD_NO_RESET, - .ops = { - .read_l = sdhci_be32bs_readl, - .read_w = esdhc_readw, - .read_b = sdhci_be32bs_readb, - .write_l = sdhci_be32bs_writel, - .write_w = esdhc_writew, - .write_b = esdhc_writeb, - .set_clock = esdhc_set_clock, - .enable_dma = esdhc_of_enable_dma, - .get_max_clock = esdhc_of_get_max_clock, - .get_min_clock = esdhc_of_get_min_clock, + .ops = &sdhci_esdhc_ops, +}; + +static int __devinit sdhci_esdhc_probe(struct platform_device *pdev) +{ + return sdhci_pltfm_register(pdev, &sdhci_esdhc_pdata); +} + +static int __devexit sdhci_esdhc_remove(struct platform_device *pdev) +{ + return sdhci_pltfm_unregister(pdev); +} + +static const struct of_device_id sdhci_esdhc_of_match[] = { + { .compatible = "fsl,mpc8379-esdhc" }, + { .compatible = "fsl,mpc8536-esdhc" }, + { .compatible = "fsl,esdhc" }, + { } +}; +MODULE_DEVICE_TABLE(of, sdhci_esdhc_of_match); + +static struct platform_driver sdhci_esdhc_driver = { + .driver = { + .name = "sdhci-esdhc", + .owner = THIS_MODULE, + .of_match_table = sdhci_esdhc_of_match, }, + .probe = sdhci_esdhc_probe, + .remove = __devexit_p(sdhci_esdhc_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif }; + +static int __init sdhci_esdhc_init(void) +{ + return platform_driver_register(&sdhci_esdhc_driver); +} +module_init(sdhci_esdhc_init); + +static void __exit sdhci_esdhc_exit(void) +{ + platform_driver_unregister(&sdhci_esdhc_driver); +} +module_exit(sdhci_esdhc_exit); + +MODULE_DESCRIPTION("SDHCI OF driver for Freescale MPC eSDHC"); +MODULE_AUTHOR("Xiaobo Xie , " + "Anton Vorontsov "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-of-hlwd.c b/drivers/mmc/host/sdhci-of-hlwd.c index 68ddb7546ae2..735be131dca9 100644 --- a/drivers/mmc/host/sdhci-of-hlwd.c +++ b/drivers/mmc/host/sdhci-of-hlwd.c @@ -21,8 +21,7 @@ #include #include -#include "sdhci-of.h" -#include "sdhci.h" +#include "sdhci-pltfm.h" /* * Ops and quirks for the Nintendo Wii SDHCI controllers. 
@@ -51,15 +50,63 @@ static void sdhci_hlwd_writeb(struct sdhci_host *host, u8 val, int reg) udelay(SDHCI_HLWD_WRITE_DELAY); } -struct sdhci_of_data sdhci_hlwd = { +static struct sdhci_ops sdhci_hlwd_ops = { + .read_l = sdhci_be32bs_readl, + .read_w = sdhci_be32bs_readw, + .read_b = sdhci_be32bs_readb, + .write_l = sdhci_hlwd_writel, + .write_w = sdhci_hlwd_writew, + .write_b = sdhci_hlwd_writeb, +}; + +static struct sdhci_pltfm_data sdhci_hlwd_pdata = { .quirks = SDHCI_QUIRK_32BIT_DMA_ADDR | SDHCI_QUIRK_32BIT_DMA_SIZE, - .ops = { - .read_l = sdhci_be32bs_readl, - .read_w = sdhci_be32bs_readw, - .read_b = sdhci_be32bs_readb, - .write_l = sdhci_hlwd_writel, - .write_w = sdhci_hlwd_writew, - .write_b = sdhci_hlwd_writeb, + .ops = &sdhci_hlwd_ops, +}; + +static int __devinit sdhci_hlwd_probe(struct platform_device *pdev) +{ + return sdhci_pltfm_register(pdev, &sdhci_hlwd_pdata); +} + +static int __devexit sdhci_hlwd_remove(struct platform_device *pdev) +{ + return sdhci_pltfm_unregister(pdev); +} + +static const struct of_device_id sdhci_hlwd_of_match[] = { + { .compatible = "nintendo,hollywood-sdhci" }, + { } +}; +MODULE_DEVICE_TABLE(of, sdhci_hlwd_of_match); + +static struct platform_driver sdhci_hlwd_driver = { + .driver = { + .name = "sdhci-hlwd", + .owner = THIS_MODULE, + .of_match_table = sdhci_hlwd_of_match, }, + .probe = sdhci_hlwd_probe, + .remove = __devexit_p(sdhci_hlwd_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif }; + +static int __init sdhci_hlwd_init(void) +{ + return platform_driver_register(&sdhci_hlwd_driver); +} +module_init(sdhci_hlwd_init); + +static void __exit sdhci_hlwd_exit(void) +{ + platform_driver_unregister(&sdhci_hlwd_driver); +} +module_exit(sdhci_hlwd_exit); + +MODULE_DESCRIPTION("Nintendo Wii SDHCI OF driver"); +MODULE_AUTHOR("The GameCube Linux Team, Albert Herranz"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-of.h b/drivers/mmc/host/sdhci-of.h deleted file mode 100644 index ad09ad9915d8..000000000000 --- a/drivers/mmc/host/sdhci-of.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * OpenFirmware bindings for Secure Digital Host Controller Interface. - * - * Copyright (c) 2007 Freescale Semiconductor, Inc. - * Copyright (c) 2009 MontaVista Software, Inc. - * - * Authors: Xiaobo Xie - * Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. 
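
The sdhci-of.h removal below is possible because sdhci_pltfm_host absorbed both fields that sdhci_of_host carried. Its approximate shape in this series, inferred from the conversions elsewhere in this diff rather than quoted from sdhci-pltfm.h:

struct sdhci_pltfm_host {
	struct clk *clk;	/* used by e.g. sdhci-esdhc-imx */
	void *priv;		/* per-driver data (pltfm_imx_data, ...) */
	unsigned int clock;	/* was sdhci_of_host.clock */
	u16 xfer_mode_shadow;	/* was sdhci_of_host.xfer_mode_shadow */
};

With one private struct, the OF helpers (the byte-swapped accessors, sdhci_get_of_property()) can live in sdhci-pltfm.c and be shared by OF and non-OF platform drivers alike.
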
- */ - -#ifndef __SDHCI_OF_H -#define __SDHCI_OF_H - -#include -#include "sdhci.h" - -struct sdhci_of_data { - unsigned int quirks; - struct sdhci_ops ops; -}; - -struct sdhci_of_host { - unsigned int clock; - u16 xfer_mode_shadow; -}; - -extern u32 sdhci_be32bs_readl(struct sdhci_host *host, int reg); -extern u16 sdhci_be32bs_readw(struct sdhci_host *host, int reg); -extern u8 sdhci_be32bs_readb(struct sdhci_host *host, int reg); -extern void sdhci_be32bs_writel(struct sdhci_host *host, u32 val, int reg); -extern void sdhci_be32bs_writew(struct sdhci_host *host, u16 val, int reg); -extern void sdhci_be32bs_writeb(struct sdhci_host *host, u8 val, int reg); - -extern struct sdhci_of_data sdhci_esdhc; -extern struct sdhci_of_data sdhci_hlwd; - -#endif /* __SDHCI_OF_H */ diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index 936bbca19c0a..26c528648f3c 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -143,6 +143,12 @@ static const struct sdhci_pci_fixes sdhci_cafe = { SDHCI_QUIRK_BROKEN_TIMEOUT_VAL, }; +static int mrst_hc_probe_slot(struct sdhci_pci_slot *slot) +{ + slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA; + return 0; +} + /* * ADMA operation is disabled for Moorestown platform due to * hardware bugs. @@ -157,8 +163,15 @@ static int mrst_hc_probe(struct sdhci_pci_chip *chip) return 0; } +static int mfd_emmc_probe_slot(struct sdhci_pci_slot *slot) +{ + slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA; + return 0; +} + static const struct sdhci_pci_fixes sdhci_intel_mrst_hc0 = { .quirks = SDHCI_QUIRK_BROKEN_ADMA | SDHCI_QUIRK_NO_HISPD_BIT, + .probe_slot = mrst_hc_probe_slot, }; static const struct sdhci_pci_fixes sdhci_intel_mrst_hc1_hc2 = { @@ -170,8 +183,13 @@ static const struct sdhci_pci_fixes sdhci_intel_mfd_sd = { .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, }; -static const struct sdhci_pci_fixes sdhci_intel_mfd_emmc_sdio = { +static const struct sdhci_pci_fixes sdhci_intel_mfd_sdio = { + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, +}; + +static const struct sdhci_pci_fixes sdhci_intel_mfd_emmc = { .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, + .probe_slot = mfd_emmc_probe_slot, }; /* O2Micro extra registers */ @@ -682,7 +700,7 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .device = PCI_DEVICE_ID_INTEL_MFD_SDIO1, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc_sdio, + .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_sdio, }, { @@ -690,7 +708,7 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .device = PCI_DEVICE_ID_INTEL_MFD_SDIO2, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc_sdio, + .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_sdio, }, { @@ -698,7 +716,7 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .device = PCI_DEVICE_ID_INTEL_MFD_EMMC0, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc_sdio, + .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc, }, { @@ -706,7 +724,7 @@ static const struct pci_device_id pci_ids[] __devinitdata = { .device = PCI_DEVICE_ID_INTEL_MFD_EMMC1, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, - .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc_sdio, + .driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc, }, { @@ -789,8 +807,34 @@ static int sdhci_pci_enable_dma(struct sdhci_host *host) return 0; } +static int sdhci_pci_8bit_width(struct sdhci_host *host, int width) +{ + u8 
ctrl; + + ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); + + switch (width) { + case MMC_BUS_WIDTH_8: + ctrl |= SDHCI_CTRL_8BITBUS; + ctrl &= ~SDHCI_CTRL_4BITBUS; + break; + case MMC_BUS_WIDTH_4: + ctrl |= SDHCI_CTRL_4BITBUS; + ctrl &= ~SDHCI_CTRL_8BITBUS; + break; + default: + ctrl &= ~(SDHCI_CTRL_8BITBUS | SDHCI_CTRL_4BITBUS); + break; + } + + sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); + + return 0; +} + static struct sdhci_ops sdhci_pci_ops = { .enable_dma = sdhci_pci_enable_dma, + .platform_8bit_width = sdhci_pci_8bit_width, }; /*****************************************************************************\ diff --git a/drivers/mmc/host/sdhci-pltfm.c b/drivers/mmc/host/sdhci-pltfm.c index dbab0407f4b6..71c0ce1f6db0 100644 --- a/drivers/mmc/host/sdhci-pltfm.c +++ b/drivers/mmc/host/sdhci-pltfm.c @@ -2,6 +2,12 @@ * sdhci-pltfm.c Support for SDHCI platform devices * Copyright (c) 2009 Intel Corporation * + * Copyright (c) 2007 Freescale Semiconductor, Inc. + * Copyright (c) 2009 MontaVista Software, Inc. + * + * Authors: Xiaobo Xie + * Anton Vorontsov + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -22,48 +28,66 @@ * Inspired by sdhci-pci.c, by Pierre Ossman */ -#include -#include -#include -#include +#include +#include +#ifdef CONFIG_PPC +#include +#endif +#include "sdhci-pltfm.h" -#include +static struct sdhci_ops sdhci_pltfm_ops = { +}; -#include -#include +#ifdef CONFIG_OF +static bool sdhci_of_wp_inverted(struct device_node *np) +{ + if (of_get_property(np, "sdhci,wp-inverted", NULL)) + return true; -#include "sdhci.h" -#include "sdhci-pltfm.h" + /* Old device trees don't have the wp-inverted property. 
*/ +#ifdef CONFIG_PPC + return machine_is(mpc837x_rdb) || machine_is(mpc837x_mds); +#else + return false; +#endif /* CONFIG_PPC */ +} + +void sdhci_get_of_property(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct sdhci_host *host = platform_get_drvdata(pdev); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + const __be32 *clk; + int size; -/*****************************************************************************\ - * * - * SDHCI core callbacks * - * * -\*****************************************************************************/ + if (of_device_is_available(np)) { + if (of_get_property(np, "sdhci,auto-cmd12", NULL)) + host->quirks |= SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12; -static struct sdhci_ops sdhci_pltfm_ops = { -}; + if (of_get_property(np, "sdhci,1-bit-only", NULL)) + host->quirks |= SDHCI_QUIRK_FORCE_1_BIT_DATA; -/*****************************************************************************\ - * * - * Device probing/removal * - * * -\*****************************************************************************/ + if (sdhci_of_wp_inverted(np)) + host->quirks |= SDHCI_QUIRK_INVERTED_WRITE_PROTECT; -static int __devinit sdhci_pltfm_probe(struct platform_device *pdev) + clk = of_get_property(np, "clock-frequency", &size); + if (clk && size == sizeof(*clk) && *clk) + pltfm_host->clock = be32_to_cpup(clk); + } +} +#else +void sdhci_get_of_property(struct platform_device *pdev) {} +#endif /* CONFIG_OF */ +EXPORT_SYMBOL_GPL(sdhci_get_of_property); + +struct sdhci_host *sdhci_pltfm_init(struct platform_device *pdev, + struct sdhci_pltfm_data *pdata) { - const struct platform_device_id *platid = platform_get_device_id(pdev); - struct sdhci_pltfm_data *pdata; struct sdhci_host *host; struct sdhci_pltfm_host *pltfm_host; struct resource *iomem; int ret; - if (platid && platid->driver_data) - pdata = (void *)platid->driver_data; - else - pdata = pdev->dev.platform_data; - iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!iomem) { ret = -ENOMEM; @@ -71,8 +95,7 @@ static int __devinit sdhci_pltfm_probe(struct platform_device *pdev) } if (resource_size(iomem) < 0x100) - dev_err(&pdev->dev, "Invalid iomem size. 
You may " - "experience problems.\n"); + dev_err(&pdev->dev, "Invalid iomem size!\n"); /* Some PCI-based MFD need the parent here */ if (pdev->dev.parent != &platform_bus) @@ -87,7 +110,7 @@ static int __devinit sdhci_pltfm_probe(struct platform_device *pdev) pltfm_host = sdhci_priv(host); - host->hw_name = "platform"; + host->hw_name = dev_name(&pdev->dev); if (pdata && pdata->ops) host->ops = pdata->ops; else @@ -110,126 +133,95 @@ static int __devinit sdhci_pltfm_probe(struct platform_device *pdev) goto err_remap; } - if (pdata && pdata->init) { - ret = pdata->init(host, pdata); - if (ret) - goto err_plat_init; - } - - ret = sdhci_add_host(host); - if (ret) - goto err_add_host; - platform_set_drvdata(pdev, host); - return 0; + return host; -err_add_host: - if (pdata && pdata->exit) - pdata->exit(host); -err_plat_init: - iounmap(host->ioaddr); err_remap: release_mem_region(iomem->start, resource_size(iomem)); err_request: sdhci_free_host(host); err: - printk(KERN_ERR"Probing of sdhci-pltfm failed: %d\n", ret); - return ret; + dev_err(&pdev->dev, "%s failed %d\n", __func__, ret); + return ERR_PTR(ret); } +EXPORT_SYMBOL_GPL(sdhci_pltfm_init); -static int __devexit sdhci_pltfm_remove(struct platform_device *pdev) +void sdhci_pltfm_free(struct platform_device *pdev) { - struct sdhci_pltfm_data *pdata = pdev->dev.platform_data; struct sdhci_host *host = platform_get_drvdata(pdev); struct resource *iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - int dead; - u32 scratch; - - dead = 0; - scratch = readl(host->ioaddr + SDHCI_INT_STATUS); - if (scratch == (u32)-1) - dead = 1; - sdhci_remove_host(host, dead); - if (pdata && pdata->exit) - pdata->exit(host); iounmap(host->ioaddr); release_mem_region(iomem->start, resource_size(iomem)); sdhci_free_host(host); platform_set_drvdata(pdev, NULL); +} +EXPORT_SYMBOL_GPL(sdhci_pltfm_free); - return 0; +int sdhci_pltfm_register(struct platform_device *pdev, + struct sdhci_pltfm_data *pdata) +{ + struct sdhci_host *host; + int ret = 0; + + host = sdhci_pltfm_init(pdev, pdata); + if (IS_ERR(host)) + return PTR_ERR(host); + + sdhci_get_of_property(pdev); + + ret = sdhci_add_host(host); + if (ret) + sdhci_pltfm_free(pdev); + + return ret; } +EXPORT_SYMBOL_GPL(sdhci_pltfm_register); -static const struct platform_device_id sdhci_pltfm_ids[] = { - { "sdhci", }, -#ifdef CONFIG_MMC_SDHCI_CNS3XXX - { "sdhci-cns3xxx", (kernel_ulong_t)&sdhci_cns3xxx_pdata }, -#endif -#ifdef CONFIG_MMC_SDHCI_ESDHC_IMX - { "sdhci-esdhc-imx", (kernel_ulong_t)&sdhci_esdhc_imx_pdata }, -#endif -#ifdef CONFIG_MMC_SDHCI_DOVE - { "sdhci-dove", (kernel_ulong_t)&sdhci_dove_pdata }, -#endif -#ifdef CONFIG_MMC_SDHCI_TEGRA - { "sdhci-tegra", (kernel_ulong_t)&sdhci_tegra_pdata }, -#endif - { }, -}; -MODULE_DEVICE_TABLE(platform, sdhci_pltfm_ids); +int sdhci_pltfm_unregister(struct platform_device *pdev) +{ + struct sdhci_host *host = platform_get_drvdata(pdev); + int dead = (readl(host->ioaddr + SDHCI_INT_STATUS) == 0xffffffff); + + sdhci_remove_host(host, dead); + sdhci_pltfm_free(pdev); + + return 0; +} +EXPORT_SYMBOL_GPL(sdhci_pltfm_unregister); #ifdef CONFIG_PM -static int sdhci_pltfm_suspend(struct platform_device *dev, pm_message_t state) +int sdhci_pltfm_suspend(struct platform_device *dev, pm_message_t state) { struct sdhci_host *host = platform_get_drvdata(dev); return sdhci_suspend_host(host, state); } +EXPORT_SYMBOL_GPL(sdhci_pltfm_suspend); -static int sdhci_pltfm_resume(struct platform_device *dev) +int sdhci_pltfm_resume(struct platform_device *dev) { struct sdhci_host *host = 
platform_get_drvdata(dev); return sdhci_resume_host(host); } -#else -#define sdhci_pltfm_suspend NULL -#define sdhci_pltfm_resume NULL +EXPORT_SYMBOL_GPL(sdhci_pltfm_resume); #endif /* CONFIG_PM */ -static struct platform_driver sdhci_pltfm_driver = { - .driver = { - .name = "sdhci", - .owner = THIS_MODULE, - }, - .probe = sdhci_pltfm_probe, - .remove = __devexit_p(sdhci_pltfm_remove), - .id_table = sdhci_pltfm_ids, - .suspend = sdhci_pltfm_suspend, - .resume = sdhci_pltfm_resume, -}; - -/*****************************************************************************\ - * * - * Driver init/exit * - * * -\*****************************************************************************/ - -static int __init sdhci_drv_init(void) +static int __init sdhci_pltfm_drv_init(void) { - return platform_driver_register(&sdhci_pltfm_driver); + pr_info("sdhci-pltfm: SDHCI platform and OF driver helper\n"); + + return 0; } +module_init(sdhci_pltfm_drv_init); -static void __exit sdhci_drv_exit(void) +static void __exit sdhci_pltfm_drv_exit(void) { - platform_driver_unregister(&sdhci_pltfm_driver); } +module_exit(sdhci_pltfm_drv_exit); -module_init(sdhci_drv_init); -module_exit(sdhci_drv_exit); - -MODULE_DESCRIPTION("Secure Digital Host Controller Interface platform driver"); -MODULE_AUTHOR("Mocean Laboratories "); +MODULE_DESCRIPTION("SDHCI platform and OF driver helper"); +MODULE_AUTHOR("Intel Corporation"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-pltfm.h b/drivers/mmc/host/sdhci-pltfm.h index 2b37016ad0ac..3a9fc3f40840 100644 --- a/drivers/mmc/host/sdhci-pltfm.h +++ b/drivers/mmc/host/sdhci-pltfm.h @@ -12,17 +12,95 @@ #define _DRIVERS_MMC_SDHCI_PLTFM_H #include -#include -#include +#include +#include "sdhci.h" + +struct sdhci_pltfm_data { + struct sdhci_ops *ops; + unsigned int quirks; +}; struct sdhci_pltfm_host { struct clk *clk; void *priv; /* to handle quirks across io-accessor calls */ + + /* migrate from sdhci_of_host */ + unsigned int clock; + u16 xfer_mode_shadow; }; -extern struct sdhci_pltfm_data sdhci_cns3xxx_pdata; -extern struct sdhci_pltfm_data sdhci_esdhc_imx_pdata; -extern struct sdhci_pltfm_data sdhci_dove_pdata; -extern struct sdhci_pltfm_data sdhci_tegra_pdata; +#ifdef CONFIG_MMC_SDHCI_BIG_ENDIAN_32BIT_BYTE_SWAPPER +/* + * These accessors are designed for big endian hosts doing I/O to + * little endian controllers incorporating a 32-bit hardware byte swapper. + */ +static inline u32 sdhci_be32bs_readl(struct sdhci_host *host, int reg) +{ + return in_be32(host->ioaddr + reg); +} + +static inline u16 sdhci_be32bs_readw(struct sdhci_host *host, int reg) +{ + return in_be16(host->ioaddr + (reg ^ 0x2)); +} + +static inline u8 sdhci_be32bs_readb(struct sdhci_host *host, int reg) +{ + return in_8(host->ioaddr + (reg ^ 0x3)); +} + +static inline void sdhci_be32bs_writel(struct sdhci_host *host, + u32 val, int reg) +{ + out_be32(host->ioaddr + reg, val); +} + +static inline void sdhci_be32bs_writew(struct sdhci_host *host, + u16 val, int reg) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + int base = reg & ~0x3; + int shift = (reg & 0x2) * 8; + + switch (reg) { + case SDHCI_TRANSFER_MODE: + /* + * Postpone this write, we must do it together with a + * command write that is down below. 
+ */ + pltfm_host->xfer_mode_shadow = val; + return; + case SDHCI_COMMAND: + sdhci_be32bs_writel(host, + val << 16 | pltfm_host->xfer_mode_shadow, + SDHCI_TRANSFER_MODE); + return; + } + clrsetbits_be32(host->ioaddr + base, 0xffff << shift, val << shift); +} + +static inline void sdhci_be32bs_writeb(struct sdhci_host *host, u8 val, int reg) +{ + int base = reg & ~0x3; + int shift = (reg & 0x3) * 8; + + clrsetbits_be32(host->ioaddr + base , 0xff << shift, val << shift); +} +#endif /* CONFIG_MMC_SDHCI_BIG_ENDIAN_32BIT_BYTE_SWAPPER */ + +extern void sdhci_get_of_property(struct platform_device *pdev); + +extern struct sdhci_host *sdhci_pltfm_init(struct platform_device *pdev, + struct sdhci_pltfm_data *pdata); +extern void sdhci_pltfm_free(struct platform_device *pdev); + +extern int sdhci_pltfm_register(struct platform_device *pdev, + struct sdhci_pltfm_data *pdata); +extern int sdhci_pltfm_unregister(struct platform_device *pdev); + +#ifdef CONFIG_PM +extern int sdhci_pltfm_suspend(struct platform_device *dev, pm_message_t state); +extern int sdhci_pltfm_resume(struct platform_device *dev); +#endif #endif /* _DRIVERS_MMC_SDHCI_PLTFM_H */ diff --git a/drivers/mmc/host/sdhci-pxa.c b/drivers/mmc/host/sdhci-pxa.c deleted file mode 100644 index 089c9a68b7b1..000000000000 --- a/drivers/mmc/host/sdhci-pxa.c +++ /dev/null @@ -1,303 +0,0 @@ -/* linux/drivers/mmc/host/sdhci-pxa.c - * - * Copyright (C) 2010 Marvell International Ltd. - * Zhangfei Gao - * Kevin Wang - * Mingwei Wang - * Philip Rakity - * Mark Brown - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* Supports: - * SDHCI support for MMP2/PXA910/PXA168 - * - * Refer to sdhci-s3c.c. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "sdhci.h" - -#define DRIVER_NAME "sdhci-pxa" - -#define SD_FIFO_PARAM 0x104 -#define DIS_PAD_SD_CLK_GATE 0x400 - -struct sdhci_pxa { - struct sdhci_host *host; - struct sdhci_pxa_platdata *pdata; - struct clk *clk; - struct resource *res; - - u8 clk_enable; -}; - -/*****************************************************************************\ - * * - * SDHCI core callbacks * - * * -\*****************************************************************************/ -static void set_clock(struct sdhci_host *host, unsigned int clock) -{ - struct sdhci_pxa *pxa = sdhci_priv(host); - u32 tmp = 0; - - if (clock == 0) { - if (pxa->clk_enable) { - clk_disable(pxa->clk); - pxa->clk_enable = 0; - } - } else { - if (0 == pxa->clk_enable) { - if (pxa->pdata->flags & PXA_FLAG_DISABLE_CLOCK_GATING) { - tmp = readl(host->ioaddr + SD_FIFO_PARAM); - tmp |= DIS_PAD_SD_CLK_GATE; - writel(tmp, host->ioaddr + SD_FIFO_PARAM); - } - clk_enable(pxa->clk); - pxa->clk_enable = 1; - } - } -} - -static int set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) -{ - u16 ctrl_2; - - /* - * Set V18_EN -- UHS modes do not work without this. 
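Two details of the byte-swapper accessors above are worth spelling out. First, the offset XOR: the swapper mirrors byte lanes within each 32-bit word, so narrow accesses must flip the low offset bits. A worked example, assuming a big-endian PowerPC host in front of a little-endian controller (exactly the configuration these helpers target):

/*
 * The 16-bit host-version register at 0xfe is fetched from
 * 0xfe ^ 0x2 = 0xfc; a byte-wide register at e.g. 0x29 would be
 * read from 0x29 ^ 0x3 = 0x2a.
 */
static inline u16 foo_read_host_version(struct sdhci_host *host)
{
	return in_be16(host->ioaddr + (0xfe ^ 0x2));	/* SDHCI_HOST_VERSION */
}

Second, the xfer_mode_shadow trick: SDHCI_TRANSFER_MODE and SDHCI_COMMAND are two 16-bit registers sharing one 32-bit word, and writing the command half is what dispatches the command. With only 32-bit stores available, a transfer-mode write therefore cannot be issued on its own; it is latched in the shadow and emitted together with the command in the single 32-bit write shown above.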
- * does not change signaling voltage - */ - ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); - - /* Select Bus Speed Mode for host */ - ctrl_2 &= ~SDHCI_CTRL_UHS_MASK; - switch (uhs) { - case MMC_TIMING_UHS_SDR12: - ctrl_2 |= SDHCI_CTRL_UHS_SDR12; - break; - case MMC_TIMING_UHS_SDR25: - ctrl_2 |= SDHCI_CTRL_UHS_SDR25; - break; - case MMC_TIMING_UHS_SDR50: - ctrl_2 |= SDHCI_CTRL_UHS_SDR50 | SDHCI_CTRL_VDD_180; - break; - case MMC_TIMING_UHS_SDR104: - ctrl_2 |= SDHCI_CTRL_UHS_SDR104 | SDHCI_CTRL_VDD_180; - break; - case MMC_TIMING_UHS_DDR50: - ctrl_2 |= SDHCI_CTRL_UHS_DDR50 | SDHCI_CTRL_VDD_180; - break; - } - - sdhci_writew(host, ctrl_2, SDHCI_HOST_CONTROL2); - pr_debug("%s:%s uhs = %d, ctrl_2 = %04X\n", - __func__, mmc_hostname(host->mmc), uhs, ctrl_2); - - return 0; -} - -static struct sdhci_ops sdhci_pxa_ops = { - .set_uhs_signaling = set_uhs_signaling, - .set_clock = set_clock, -}; - -/*****************************************************************************\ - * * - * Device probing/removal * - * * -\*****************************************************************************/ - -static int __devinit sdhci_pxa_probe(struct platform_device *pdev) -{ - struct sdhci_pxa_platdata *pdata = pdev->dev.platform_data; - struct device *dev = &pdev->dev; - struct sdhci_host *host = NULL; - struct resource *iomem = NULL; - struct sdhci_pxa *pxa = NULL; - int ret, irq; - - irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_err(dev, "no irq specified\n"); - return irq; - } - - iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!iomem) { - dev_err(dev, "no memory specified\n"); - return -ENOENT; - } - - host = sdhci_alloc_host(&pdev->dev, sizeof(struct sdhci_pxa)); - if (IS_ERR(host)) { - dev_err(dev, "failed to alloc host\n"); - return PTR_ERR(host); - } - - pxa = sdhci_priv(host); - pxa->host = host; - pxa->pdata = pdata; - pxa->clk_enable = 0; - - pxa->clk = clk_get(dev, "PXA-SDHCLK"); - if (IS_ERR(pxa->clk)) { - dev_err(dev, "failed to get io clock\n"); - ret = PTR_ERR(pxa->clk); - goto out; - } - - pxa->res = request_mem_region(iomem->start, resource_size(iomem), - mmc_hostname(host->mmc)); - if (!pxa->res) { - dev_err(&pdev->dev, "cannot request region\n"); - ret = -EBUSY; - goto out; - } - - host->ioaddr = ioremap(iomem->start, resource_size(iomem)); - if (!host->ioaddr) { - dev_err(&pdev->dev, "failed to remap registers\n"); - ret = -ENOMEM; - goto out; - } - - host->hw_name = "MMC"; - host->ops = &sdhci_pxa_ops; - host->irq = irq; - host->quirks = SDHCI_QUIRK_BROKEN_ADMA - | SDHCI_QUIRK_BROKEN_TIMEOUT_VAL - | SDHCI_QUIRK_32BIT_DMA_ADDR - | SDHCI_QUIRK_32BIT_DMA_SIZE - | SDHCI_QUIRK_32BIT_ADMA_SIZE - | SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC; - - if (pdata->quirks) - host->quirks |= pdata->quirks; - - /* enable 1/8V DDR capable */ - host->mmc->caps |= MMC_CAP_1_8V_DDR; - - /* If slot design supports 8 bit data, indicate this to MMC. 
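The deleted sdhci-pxa.c driver already took board-specific capabilities and quirks through platform data, a pattern the pxav2/pxav3 replacements below keep. A hypothetical board-file fragment feeding this merging (values are illustrative only; the field names follow sdhci_pxa_platdata as used in this diff):

static struct sdhci_pxa_platdata foo_board_sdhci_pdata = {
	.flags	= PXA_FLAG_SD_8_BIT_CAPABLE_SLOT,	/* slot wired for 8-bit data */
	.quirks	= SDHCI_QUIRK_BROKEN_CARD_DETECTION,	/* illustrative extra quirk */
};

A board would attach this as the platform_device's dev.platform_data; the probe then ORs .quirks into host->quirks and translates .flags into mmc caps, as the hunks here show.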
*/ - if (pdata->flags & PXA_FLAG_SD_8_BIT_CAPABLE_SLOT) - host->mmc->caps |= MMC_CAP_8_BIT_DATA; - - ret = sdhci_add_host(host); - if (ret) { - dev_err(&pdev->dev, "failed to add host\n"); - goto out; - } - - if (pxa->pdata->max_speed) - host->mmc->f_max = pxa->pdata->max_speed; - - platform_set_drvdata(pdev, host); - - return 0; -out: - if (host) { - clk_put(pxa->clk); - if (host->ioaddr) - iounmap(host->ioaddr); - if (pxa->res) - release_mem_region(pxa->res->start, - resource_size(pxa->res)); - sdhci_free_host(host); - } - - return ret; -} - -static int __devexit sdhci_pxa_remove(struct platform_device *pdev) -{ - struct sdhci_host *host = platform_get_drvdata(pdev); - struct sdhci_pxa *pxa = sdhci_priv(host); - int dead = 0; - u32 scratch; - - if (host) { - scratch = readl(host->ioaddr + SDHCI_INT_STATUS); - if (scratch == (u32)-1) - dead = 1; - - sdhci_remove_host(host, dead); - - if (host->ioaddr) - iounmap(host->ioaddr); - if (pxa->res) - release_mem_region(pxa->res->start, - resource_size(pxa->res)); - if (pxa->clk_enable) { - clk_disable(pxa->clk); - pxa->clk_enable = 0; - } - clk_put(pxa->clk); - - sdhci_free_host(host); - platform_set_drvdata(pdev, NULL); - } - - return 0; -} - -#ifdef CONFIG_PM -static int sdhci_pxa_suspend(struct platform_device *dev, pm_message_t state) -{ - struct sdhci_host *host = platform_get_drvdata(dev); - - return sdhci_suspend_host(host, state); -} - -static int sdhci_pxa_resume(struct platform_device *dev) -{ - struct sdhci_host *host = platform_get_drvdata(dev); - - return sdhci_resume_host(host); -} -#else -#define sdhci_pxa_suspend NULL -#define sdhci_pxa_resume NULL -#endif - -static struct platform_driver sdhci_pxa_driver = { - .probe = sdhci_pxa_probe, - .remove = __devexit_p(sdhci_pxa_remove), - .suspend = sdhci_pxa_suspend, - .resume = sdhci_pxa_resume, - .driver = { - .name = DRIVER_NAME, - .owner = THIS_MODULE, - }, -}; - -/*****************************************************************************\ - * * - * Driver init/exit * - * * -\*****************************************************************************/ - -static int __init sdhci_pxa_init(void) -{ - return platform_driver_register(&sdhci_pxa_driver); -} - -static void __exit sdhci_pxa_exit(void) -{ - platform_driver_unregister(&sdhci_pxa_driver); -} - -module_init(sdhci_pxa_init); -module_exit(sdhci_pxa_exit); - -MODULE_DESCRIPTION("SDH controller driver for PXA168/PXA910/MMP2"); -MODULE_AUTHOR("Zhangfei Gao "); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-pxav2.c b/drivers/mmc/host/sdhci-pxav2.c new file mode 100644 index 000000000000..38f58994f79a --- /dev/null +++ b/drivers/mmc/host/sdhci-pxav2.c @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2010 Marvell International Ltd. + * Zhangfei Gao + * Kevin Wang + * Jun Nie + * Qiming Wu + * Philip Rakity + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sdhci.h" +#include "sdhci-pltfm.h" + +#define SD_FIFO_PARAM 0xe0 +#define DIS_PAD_SD_CLK_GATE 0x0400 /* Turn on/off Dynamic SD Clock Gating */ +#define CLK_GATE_ON 0x0200 /* Disable/enable Clock Gate */ +#define CLK_GATE_CTL 0x0100 /* Clock Gate Control */ +#define CLK_GATE_SETTING_BITS (DIS_PAD_SD_CLK_GATE | \ + CLK_GATE_ON | CLK_GATE_CTL) + +#define SD_CLOCK_BURST_SIZE_SETUP 0xe6 +#define SDCLK_SEL_SHIFT 8 +#define SDCLK_SEL_MASK 0x3 +#define SDCLK_DELAY_SHIFT 10 +#define SDCLK_DELAY_MASK 0x3c + +#define SD_CE_ATA_2 0xea +#define MMC_CARD 0x1000 +#define MMC_WIDTH 0x0100 + +static void pxav2_set_private_registers(struct sdhci_host *host, u8 mask) +{ + struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); + struct sdhci_pxa_platdata *pdata = pdev->dev.platform_data; + + if (mask == SDHCI_RESET_ALL) { + u16 tmp = 0; + + /* + * tune timing of read data/command when crc error happen + * no performance impact + */ + if (pdata->clk_delay_sel == 1) { + tmp = readw(host->ioaddr + SD_CLOCK_BURST_SIZE_SETUP); + + tmp &= ~(SDCLK_DELAY_MASK << SDCLK_DELAY_SHIFT); + tmp |= (pdata->clk_delay_cycles & SDCLK_DELAY_MASK) + << SDCLK_DELAY_SHIFT; + tmp &= ~(SDCLK_SEL_MASK << SDCLK_SEL_SHIFT); + tmp |= (1 & SDCLK_SEL_MASK) << SDCLK_SEL_SHIFT; + + writew(tmp, host->ioaddr + SD_CLOCK_BURST_SIZE_SETUP); + } + + if (pdata->flags & PXA_FLAG_ENABLE_CLOCK_GATING) { + tmp = readw(host->ioaddr + SD_FIFO_PARAM); + tmp &= ~CLK_GATE_SETTING_BITS; + writew(tmp, host->ioaddr + SD_FIFO_PARAM); + } else { + tmp = readw(host->ioaddr + SD_FIFO_PARAM); + tmp &= ~CLK_GATE_SETTING_BITS; + tmp |= CLK_GATE_SETTING_BITS; + writew(tmp, host->ioaddr + SD_FIFO_PARAM); + } + } +} + +static int pxav2_mmc_set_width(struct sdhci_host *host, int width) +{ + u8 ctrl; + u16 tmp; + + ctrl = readb(host->ioaddr + SDHCI_HOST_CONTROL); + tmp = readw(host->ioaddr + SD_CE_ATA_2); + if (width == MMC_BUS_WIDTH_8) { + ctrl &= ~SDHCI_CTRL_4BITBUS; + tmp |= MMC_CARD | MMC_WIDTH; + } else { + tmp &= ~(MMC_CARD | MMC_WIDTH); + if (width == MMC_BUS_WIDTH_4) + ctrl |= SDHCI_CTRL_4BITBUS; + else + ctrl &= ~SDHCI_CTRL_4BITBUS; + } + writew(tmp, host->ioaddr + SD_CE_ATA_2); + writeb(ctrl, host->ioaddr + SDHCI_HOST_CONTROL); + + return 0; +} + +static u32 pxav2_get_max_clock(struct sdhci_host *host) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + + return clk_get_rate(pltfm_host->clk); +} + +static struct sdhci_ops pxav2_sdhci_ops = { + .get_max_clock = pxav2_get_max_clock, + .platform_reset_exit = pxav2_set_private_registers, + .platform_8bit_width = pxav2_mmc_set_width, +}; + +static int __devinit sdhci_pxav2_probe(struct platform_device *pdev) +{ + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_pxa_platdata *pdata = pdev->dev.platform_data; + struct device *dev = &pdev->dev; + struct sdhci_host *host = NULL; + struct sdhci_pxa *pxa = NULL; + int ret; + struct clk *clk; + + pxa = kzalloc(sizeof(struct sdhci_pxa), GFP_KERNEL); + if (!pxa) + return -ENOMEM; + + host = sdhci_pltfm_init(pdev, NULL); + if (IS_ERR(host)) { + kfree(pxa); + return PTR_ERR(host); + } + pltfm_host = sdhci_priv(host); + pltfm_host->priv = pxa; + + clk = clk_get(dev, "PXA-SDHCLK"); + if (IS_ERR(clk)) { + dev_err(dev, "failed to get io clock\n"); + ret = PTR_ERR(clk); + goto err_clk_get; + } + pltfm_host->clk = clk; + clk_enable(clk); + + host->quirks = SDHCI_QUIRK_BROKEN_ADMA + | SDHCI_QUIRK_BROKEN_TIMEOUT_VAL + | 
SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN; + + if (pdata) { + if (pdata->flags & PXA_FLAG_CARD_PERMANENT) { + /* on-chip device */ + host->quirks |= SDHCI_QUIRK_BROKEN_CARD_DETECTION; + host->mmc->caps |= MMC_CAP_NONREMOVABLE; + } + + /* If slot design supports 8 bit data, indicate this to MMC. */ + if (pdata->flags & PXA_FLAG_SD_8_BIT_CAPABLE_SLOT) + host->mmc->caps |= MMC_CAP_8_BIT_DATA; + + if (pdata->quirks) + host->quirks |= pdata->quirks; + if (pdata->host_caps) + host->mmc->caps |= pdata->host_caps; + if (pdata->pm_caps) + host->mmc->pm_caps |= pdata->pm_caps; + } + + host->ops = &pxav2_sdhci_ops; + + ret = sdhci_add_host(host); + if (ret) { + dev_err(&pdev->dev, "failed to add host\n"); + goto err_add_host; + } + + platform_set_drvdata(pdev, host); + + return 0; + +err_add_host: + clk_disable(clk); + clk_put(clk); +err_clk_get: + sdhci_pltfm_free(pdev); + kfree(pxa); + return ret; +} + +static int __devexit sdhci_pxav2_remove(struct platform_device *pdev) +{ + struct sdhci_host *host = platform_get_drvdata(pdev); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_pxa *pxa = pltfm_host->priv; + + sdhci_remove_host(host, 1); + + clk_disable(pltfm_host->clk); + clk_put(pltfm_host->clk); + sdhci_pltfm_free(pdev); + kfree(pxa); + + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static struct platform_driver sdhci_pxav2_driver = { + .driver = { + .name = "sdhci-pxav2", + .owner = THIS_MODULE, + }, + .probe = sdhci_pxav2_probe, + .remove = __devexit_p(sdhci_pxav2_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif +}; +static int __init sdhci_pxav2_init(void) +{ + return platform_driver_register(&sdhci_pxav2_driver); +} + +static void __exit sdhci_pxav2_exit(void) +{ + platform_driver_unregister(&sdhci_pxav2_driver); +} + +module_init(sdhci_pxav2_init); +module_exit(sdhci_pxav2_exit); + +MODULE_DESCRIPTION("SDHCI driver for pxav2"); +MODULE_AUTHOR("Marvell International Ltd."); +MODULE_LICENSE("GPL v2"); + diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c new file mode 100644 index 000000000000..4198dbbc5c20 --- /dev/null +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -0,0 +1,289 @@ +/* + * Copyright (C) 2010 Marvell International Ltd. + * Zhangfei Gao + * Kevin Wang + * Mingwei Wang + * Philip Rakity + * Mark Brown + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
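The new pxav2 driver (and pxav3 below) shares one convention worth calling out: driver-private state is a separate allocation parked in sdhci_pltfm_host->priv. Condensed from the probe above, with error checking elided for brevity (see the full goto unwind in the hunks):

struct sdhci_pxa *pxa;
struct sdhci_host *host;
struct sdhci_pltfm_host *pltfm_host;

pxa = kzalloc(sizeof(struct sdhci_pxa), GFP_KERNEL);	/* driver state  */
host = sdhci_pltfm_init(pdev, NULL);			/* generic setup */
pltfm_host = sdhci_priv(host);
pltfm_host->priv = pxa;		/* retrievable later via sdhci_priv() */
/* teardown mirrors this: sdhci_pltfm_free(pdev); kfree(pxa); */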
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sdhci.h" +#include "sdhci-pltfm.h" + +#define SD_CLOCK_BURST_SIZE_SETUP 0x10A +#define SDCLK_SEL 0x100 +#define SDCLK_DELAY_SHIFT 9 +#define SDCLK_DELAY_MASK 0x1f + +#define SD_CFG_FIFO_PARAM 0x100 +#define SDCFG_GEN_PAD_CLK_ON (1<<6) +#define SDCFG_GEN_PAD_CLK_CNT_MASK 0xFF +#define SDCFG_GEN_PAD_CLK_CNT_SHIFT 24 + +#define SD_SPI_MODE 0x108 +#define SD_CE_ATA_1 0x10C + +#define SD_CE_ATA_2 0x10E +#define SDCE_MISC_INT (1<<2) +#define SDCE_MISC_INT_EN (1<<1) + +static void pxav3_set_private_registers(struct sdhci_host *host, u8 mask) +{ + struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); + struct sdhci_pxa_platdata *pdata = pdev->dev.platform_data; + + if (mask == SDHCI_RESET_ALL) { + /* + * tune timing of read data/command when crc error happen + * no performance impact + */ + if (pdata && 0 != pdata->clk_delay_cycles) { + u16 tmp; + + tmp = readw(host->ioaddr + SD_CLOCK_BURST_SIZE_SETUP); + tmp |= (pdata->clk_delay_cycles & SDCLK_DELAY_MASK) + << SDCLK_DELAY_SHIFT; + tmp |= SDCLK_SEL; + writew(tmp, host->ioaddr + SD_CLOCK_BURST_SIZE_SETUP); + } + } +} + +#define MAX_WAIT_COUNT 5 +static void pxav3_gen_init_74_clocks(struct sdhci_host *host, u8 power_mode) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_pxa *pxa = pltfm_host->priv; + u16 tmp; + int count; + + if (pxa->power_mode == MMC_POWER_UP + && power_mode == MMC_POWER_ON) { + + dev_dbg(mmc_dev(host->mmc), + "%s: slot->power_mode = %d," + "ios->power_mode = %d\n", + __func__, + pxa->power_mode, + power_mode); + + /* set we want notice of when 74 clocks are sent */ + tmp = readw(host->ioaddr + SD_CE_ATA_2); + tmp |= SDCE_MISC_INT_EN; + writew(tmp, host->ioaddr + SD_CE_ATA_2); + + /* start sending the 74 clocks */ + tmp = readw(host->ioaddr + SD_CFG_FIFO_PARAM); + tmp |= SDCFG_GEN_PAD_CLK_ON; + writew(tmp, host->ioaddr + SD_CFG_FIFO_PARAM); + + /* slowest speed is about 100KHz or 10usec per clock */ + udelay(740); + count = 0; + + while (count++ < MAX_WAIT_COUNT) { + if ((readw(host->ioaddr + SD_CE_ATA_2) + & SDCE_MISC_INT) == 0) + break; + udelay(10); + } + + if (count == MAX_WAIT_COUNT) + dev_warn(mmc_dev(host->mmc), "74 clock interrupt not cleared\n"); + + /* clear the interrupt bit if posted */ + tmp = readw(host->ioaddr + SD_CE_ATA_2); + tmp |= SDCE_MISC_INT; + writew(tmp, host->ioaddr + SD_CE_ATA_2); + } + pxa->power_mode = power_mode; +} + +static int pxav3_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) +{ + u16 ctrl_2; + + /* + * Set V18_EN -- UHS modes do not work without this. 
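The udelay(740) in pxav3_gen_init_74_clocks() above is not arbitrary: the SD specification requires at least 74 clock cycles on the bus after power-up, and at the slowest initialization clock of roughly 100 kHz one cycle takes about 10 us, so 74 cycles come to about 740 us. After that fixed delay the handler polls SDCE_MISC_INT up to MAX_WAIT_COUNT (5) more times at 10 us intervals, and warns if the 74-clock interrupt still has not cleared.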
+ * does not change signaling voltage + */ + ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + + /* Select Bus Speed Mode for host */ + ctrl_2 &= ~SDHCI_CTRL_UHS_MASK; + switch (uhs) { + case MMC_TIMING_UHS_SDR12: + ctrl_2 |= SDHCI_CTRL_UHS_SDR12; + break; + case MMC_TIMING_UHS_SDR25: + ctrl_2 |= SDHCI_CTRL_UHS_SDR25; + break; + case MMC_TIMING_UHS_SDR50: + ctrl_2 |= SDHCI_CTRL_UHS_SDR50 | SDHCI_CTRL_VDD_180; + break; + case MMC_TIMING_UHS_SDR104: + ctrl_2 |= SDHCI_CTRL_UHS_SDR104 | SDHCI_CTRL_VDD_180; + break; + case MMC_TIMING_UHS_DDR50: + ctrl_2 |= SDHCI_CTRL_UHS_DDR50 | SDHCI_CTRL_VDD_180; + break; + } + + sdhci_writew(host, ctrl_2, SDHCI_HOST_CONTROL2); + dev_dbg(mmc_dev(host->mmc), + "%s uhs = %d, ctrl_2 = %04X\n", + __func__, uhs, ctrl_2); + + return 0; +} + +static struct sdhci_ops pxav3_sdhci_ops = { + .platform_reset_exit = pxav3_set_private_registers, + .set_uhs_signaling = pxav3_set_uhs_signaling, + .platform_send_init_74_clocks = pxav3_gen_init_74_clocks, +}; + +static int __devinit sdhci_pxav3_probe(struct platform_device *pdev) +{ + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_pxa_platdata *pdata = pdev->dev.platform_data; + struct device *dev = &pdev->dev; + struct sdhci_host *host = NULL; + struct sdhci_pxa *pxa = NULL; + int ret; + struct clk *clk; + + pxa = kzalloc(sizeof(struct sdhci_pxa), GFP_KERNEL); + if (!pxa) + return -ENOMEM; + + host = sdhci_pltfm_init(pdev, NULL); + if (IS_ERR(host)) { + kfree(pxa); + return PTR_ERR(host); + } + pltfm_host = sdhci_priv(host); + pltfm_host->priv = pxa; + + clk = clk_get(dev, "PXA-SDHCLK"); + if (IS_ERR(clk)) { + dev_err(dev, "failed to get io clock\n"); + ret = PTR_ERR(clk); + goto err_clk_get; + } + pltfm_host->clk = clk; + clk_enable(clk); + + host->quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL + | SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC; + + /* enable 1/8V DDR capable */ + host->mmc->caps |= MMC_CAP_1_8V_DDR; + + if (pdata) { + if (pdata->flags & PXA_FLAG_CARD_PERMANENT) { + /* on-chip device */ + host->quirks |= SDHCI_QUIRK_BROKEN_CARD_DETECTION; + host->mmc->caps |= MMC_CAP_NONREMOVABLE; + } + + /* If slot design supports 8 bit data, indicate this to MMC. 
*/ + if (pdata->flags & PXA_FLAG_SD_8_BIT_CAPABLE_SLOT) + host->mmc->caps |= MMC_CAP_8_BIT_DATA; + + if (pdata->quirks) + host->quirks |= pdata->quirks; + if (pdata->host_caps) + host->mmc->caps |= pdata->host_caps; + if (pdata->pm_caps) + host->mmc->pm_caps |= pdata->pm_caps; + } + + host->ops = &pxav3_sdhci_ops; + + ret = sdhci_add_host(host); + if (ret) { + dev_err(&pdev->dev, "failed to add host\n"); + goto err_add_host; + } + + platform_set_drvdata(pdev, host); + + return 0; + +err_add_host: + clk_disable(clk); + clk_put(clk); +err_clk_get: + sdhci_pltfm_free(pdev); + kfree(pxa); + return ret; +} + +static int __devexit sdhci_pxav3_remove(struct platform_device *pdev) +{ + struct sdhci_host *host = platform_get_drvdata(pdev); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_pxa *pxa = pltfm_host->priv; + + sdhci_remove_host(host, 1); + + clk_disable(pltfm_host->clk); + clk_put(pltfm_host->clk); + sdhci_pltfm_free(pdev); + kfree(pxa); + + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static struct platform_driver sdhci_pxav3_driver = { + .driver = { + .name = "sdhci-pxav3", + .owner = THIS_MODULE, + }, + .probe = sdhci_pxav3_probe, + .remove = __devexit_p(sdhci_pxav3_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif +}; +static int __init sdhci_pxav3_init(void) +{ + return platform_driver_register(&sdhci_pxav3_driver); +} + +static void __exit sdhci_pxav3_exit(void) +{ + platform_driver_unregister(&sdhci_pxav3_driver); +} + +module_init(sdhci_pxav3_init); +module_exit(sdhci_pxav3_exit); + +MODULE_DESCRIPTION("SDHCI driver for pxav3"); +MODULE_AUTHOR("Marvell International Ltd."); +MODULE_LICENSE("GPL v2"); + diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index 69e3ee321eb5..460ffaf0f6d7 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -612,16 +612,14 @@ static int sdhci_s3c_suspend(struct platform_device *dev, pm_message_t pm) { struct sdhci_host *host = platform_get_drvdata(dev); - sdhci_suspend_host(host, pm); - return 0; + return sdhci_suspend_host(host, pm); } static int sdhci_s3c_resume(struct platform_device *dev) { struct sdhci_host *host = platform_get_drvdata(dev); - sdhci_resume_host(host); - return 0; + return sdhci_resume_host(host); } #else diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 343c97edba32..18b0bd31de78 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -24,7 +24,6 @@ #include #include -#include "sdhci.h" #include "sdhci-pltfm.h" static u32 tegra_sdhci_readl(struct sdhci_host *host, int reg) @@ -116,20 +115,42 @@ static int tegra_sdhci_8bit(struct sdhci_host *host, int bus_width) return 0; } +static struct sdhci_ops tegra_sdhci_ops = { + .get_ro = tegra_sdhci_get_ro, + .read_l = tegra_sdhci_readl, + .read_w = tegra_sdhci_readw, + .write_l = tegra_sdhci_writel, + .platform_8bit_width = tegra_sdhci_8bit, +}; + +static struct sdhci_pltfm_data sdhci_tegra_pdata = { + .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | + SDHCI_QUIRK_SINGLE_POWER_WRITE | + SDHCI_QUIRK_NO_HISPD_BIT | + SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC, + .ops = &tegra_sdhci_ops, +}; -static int tegra_sdhci_pltfm_init(struct sdhci_host *host, - struct sdhci_pltfm_data *pdata) +static int __devinit sdhci_tegra_probe(struct platform_device *pdev) { - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); + struct sdhci_pltfm_host 
*pltfm_host; struct tegra_sdhci_platform_data *plat; + struct sdhci_host *host; struct clk *clk; int rc; + host = sdhci_pltfm_init(pdev, &sdhci_tegra_pdata); + if (IS_ERR(host)) + return PTR_ERR(host); + + pltfm_host = sdhci_priv(host); + plat = pdev->dev.platform_data; + if (plat == NULL) { dev_err(mmc_dev(host->mmc), "missing platform data\n"); - return -ENXIO; + rc = -ENXIO; + goto err_no_plat; } if (gpio_is_valid(plat->power_gpio)) { @@ -137,7 +158,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host, if (rc) { dev_err(mmc_dev(host->mmc), "failed to allocate power gpio\n"); - goto out; + goto err_power_req; } tegra_gpio_enable(plat->power_gpio); gpio_direction_output(plat->power_gpio, 1); @@ -148,7 +169,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host, if (rc) { dev_err(mmc_dev(host->mmc), "failed to allocate cd gpio\n"); - goto out_power; + goto err_cd_req; } tegra_gpio_enable(plat->cd_gpio); gpio_direction_input(plat->cd_gpio); @@ -159,7 +180,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host, if (rc) { dev_err(mmc_dev(host->mmc), "request irq error\n"); - goto out_cd; + goto err_cd_irq_req; } } @@ -169,7 +190,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host, if (rc) { dev_err(mmc_dev(host->mmc), "failed to allocate wp gpio\n"); - goto out_irq; + goto err_wp_req; } tegra_gpio_enable(plat->wp_gpio); gpio_direction_input(plat->wp_gpio); @@ -179,7 +200,7 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host, if (IS_ERR(clk)) { dev_err(mmc_dev(host->mmc), "clk err\n"); rc = PTR_ERR(clk); - goto out_wp; + goto err_clk_get; } clk_enable(clk); pltfm_host->clk = clk; @@ -189,38 +210,47 @@ static int tegra_sdhci_pltfm_init(struct sdhci_host *host, if (plat->is_8bit) host->mmc->caps |= MMC_CAP_8_BIT_DATA; + rc = sdhci_add_host(host); + if (rc) + goto err_add_host; + return 0; -out_wp: +err_add_host: + clk_disable(pltfm_host->clk); + clk_put(pltfm_host->clk); +err_clk_get: if (gpio_is_valid(plat->wp_gpio)) { tegra_gpio_disable(plat->wp_gpio); gpio_free(plat->wp_gpio); } - -out_irq: +err_wp_req: if (gpio_is_valid(plat->cd_gpio)) free_irq(gpio_to_irq(plat->cd_gpio), host); -out_cd: +err_cd_irq_req: if (gpio_is_valid(plat->cd_gpio)) { tegra_gpio_disable(plat->cd_gpio); gpio_free(plat->cd_gpio); } - -out_power: +err_cd_req: if (gpio_is_valid(plat->power_gpio)) { tegra_gpio_disable(plat->power_gpio); gpio_free(plat->power_gpio); } - -out: +err_power_req: +err_no_plat: + sdhci_pltfm_free(pdev); return rc; } -static void tegra_sdhci_pltfm_exit(struct sdhci_host *host) +static int __devexit sdhci_tegra_remove(struct platform_device *pdev) { + struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); struct tegra_sdhci_platform_data *plat; + int dead = (readl(host->ioaddr + SDHCI_INT_STATUS) == 0xffffffff); + + sdhci_remove_host(host, dead); plat = pdev->dev.platform_data; @@ -242,22 +272,37 @@ static void tegra_sdhci_pltfm_exit(struct sdhci_host *host) clk_disable(pltfm_host->clk); clk_put(pltfm_host->clk); + + sdhci_pltfm_free(pdev); + + return 0; } -static struct sdhci_ops tegra_sdhci_ops = { - .get_ro = tegra_sdhci_get_ro, - .read_l = tegra_sdhci_readl, - .read_w = tegra_sdhci_readw, - .write_l = tegra_sdhci_writel, - .platform_8bit_width = tegra_sdhci_8bit, +static struct platform_driver sdhci_tegra_driver = { + .driver = { + .name = "sdhci-tegra", + .owner = THIS_MODULE, + }, + .probe = sdhci_tegra_probe, + .remove = 
__devexit_p(sdhci_tegra_remove), +#ifdef CONFIG_PM + .suspend = sdhci_pltfm_suspend, + .resume = sdhci_pltfm_resume, +#endif }; -struct sdhci_pltfm_data sdhci_tegra_pdata = { - .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | - SDHCI_QUIRK_SINGLE_POWER_WRITE | - SDHCI_QUIRK_NO_HISPD_BIT | - SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC, - .ops = &tegra_sdhci_ops, - .init = tegra_sdhci_pltfm_init, - .exit = tegra_sdhci_pltfm_exit, -}; +static int __init sdhci_tegra_init(void) +{ + return platform_driver_register(&sdhci_tegra_driver); +} +module_init(sdhci_tegra_init); + +static void __exit sdhci_tegra_exit(void) +{ + platform_driver_unregister(&sdhci_tegra_driver); +} +module_exit(sdhci_tegra_exit); + +MODULE_DESCRIPTION("SDHCI driver for Tegra"); +MODULE_AUTHOR(" Google, Inc."); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 58d5436ff649..c31a3343340d 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -127,11 +127,15 @@ static void sdhci_mask_irqs(struct sdhci_host *host, u32 irqs) static void sdhci_set_card_detection(struct sdhci_host *host, bool enable) { - u32 irqs = SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT; + u32 present, irqs; if (host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) return; + present = sdhci_readl(host, SDHCI_PRESENT_STATE) & + SDHCI_CARD_PRESENT; + irqs = present ? SDHCI_INT_CARD_REMOVE : SDHCI_INT_CARD_INSERT; + if (enable) sdhci_unmask_irqs(host, irqs); else @@ -2154,13 +2158,30 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) mmc_hostname(host->mmc), intmask); if (intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) { + u32 present = sdhci_readl(host, SDHCI_PRESENT_STATE) & + SDHCI_CARD_PRESENT; + + /* + * There is a observation on i.mx esdhc. INSERT bit will be + * immediately set again when it gets cleared, if a card is + * inserted. We have to mask the irq to prevent interrupt + * storm which will freeze the system. And the REMOVE gets + * the same situation. + * + * More testing are needed here to ensure it works for other + * platforms though. + */ + sdhci_mask_irqs(host, present ? SDHCI_INT_CARD_INSERT : + SDHCI_INT_CARD_REMOVE); + sdhci_unmask_irqs(host, present ? 
SDHCI_INT_CARD_REMOVE : + SDHCI_INT_CARD_INSERT); + sdhci_writel(host, intmask & (SDHCI_INT_CARD_INSERT | - SDHCI_INT_CARD_REMOVE), SDHCI_INT_STATUS); + SDHCI_INT_CARD_REMOVE), SDHCI_INT_STATUS); + intmask &= ~(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE); tasklet_schedule(&host->card_tasklet); } - intmask &= ~(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE); - if (intmask & SDHCI_INT_CMD_MASK) { sdhci_writel(host, intmask & SDHCI_INT_CMD_MASK, SDHCI_INT_STATUS); @@ -2488,6 +2509,11 @@ int sdhci_add_host(struct sdhci_host *host) } else mmc->f_min = host->max_clk / SDHCI_MAX_DIV_SPEC_200; + if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK) + mmc->max_discard_to = (1 << 27) / (mmc->f_max / 1000); + else + mmc->max_discard_to = (1 << 27) / host->timeout_clk; + mmc->caps |= MMC_CAP_SDIO_IRQ | MMC_CAP_ERASE | MMC_CAP_CMD23; if (host->quirks & SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12) diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 14f8edbaa195..557886bee9ce 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -175,6 +175,7 @@ struct sh_mmcif_host { enum mmcif_state state; spinlock_t lock; bool power; + bool card_present; /* DMA support */ struct dma_chan *chan_rx; @@ -877,23 +878,23 @@ static void sh_mmcif_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) spin_unlock_irqrestore(&host->lock, flags); if (ios->power_mode == MMC_POWER_UP) { - if (p->set_pwr) - p->set_pwr(host->pd, ios->power_mode); - if (!host->power) { + if (!host->card_present) { /* See if we also get DMA */ sh_mmcif_request_dma(host, host->pd->dev.platform_data); - pm_runtime_get_sync(&host->pd->dev); - host->power = true; + host->card_present = true; } } else if (ios->power_mode == MMC_POWER_OFF || !ios->clock) { /* clock stop */ sh_mmcif_clock_control(host, 0); if (ios->power_mode == MMC_POWER_OFF) { - if (host->power) { - pm_runtime_put(&host->pd->dev); + if (host->card_present) { sh_mmcif_release_dma(host); - host->power = false; + host->card_present = false; } + } + if (host->power) { + pm_runtime_put(&host->pd->dev); + host->power = false; if (p->down_pwr) p->down_pwr(host->pd); } @@ -901,8 +902,16 @@ static void sh_mmcif_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) return; } - if (ios->clock) + if (ios->clock) { + if (!host->power) { + if (p->set_pwr) + p->set_pwr(host->pd, ios->power_mode); + pm_runtime_get_sync(&host->pd->dev); + host->power = true; + sh_mmcif_sync_reset(host); + } sh_mmcif_clock_control(host, ios->clock); + } host->bus_width = ios->bus_width; host->state = STATE_IDLE; diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index ce500f03df85..774f6439d7ce 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "tmio_mmc.h" @@ -55,6 +56,39 @@ static int sh_mobile_sdhi_get_cd(struct platform_device *pdev) return -ENOSYS; } +static int sh_mobile_sdhi_wait_idle(struct tmio_mmc_host *host) +{ + int timeout = 1000; + + while (--timeout && !(sd_ctrl_read16(host, CTL_STATUS2) & (1 << 13))) + udelay(1); + + if (!timeout) { + dev_warn(host->pdata->dev, "timeout waiting for SD bus idle\n"); + return -EBUSY; + } + + return 0; +} + +static int sh_mobile_sdhi_write16_hook(struct tmio_mmc_host *host, int addr) +{ + switch (addr) + { + case CTL_SD_CMD: + case CTL_STOP_INTERNAL_ACTION: + case CTL_XFER_BLK_COUNT: + case CTL_SD_CARD_CLK_CTL: + case CTL_SD_XFER_LEN: + case CTL_SD_MEM_CARD_OPT: + case CTL_TRANSACTION_CTL: + case 
CTL_DMA_ENABLE: + return sh_mobile_sdhi_wait_idle(host); + } + + return 0; +} + static int __devinit sh_mobile_sdhi_probe(struct platform_device *pdev) { struct sh_mobile_sdhi *priv; @@ -86,6 +120,8 @@ static int __devinit sh_mobile_sdhi_probe(struct platform_device *pdev) mmc_data->hclk = clk_get_rate(priv->clk); mmc_data->set_pwr = sh_mobile_sdhi_set_pwr; mmc_data->get_cd = sh_mobile_sdhi_get_cd; + if (mmc_data->flags & TMIO_MMC_HAS_IDLE_WAIT) + mmc_data->write16_hook = sh_mobile_sdhi_write16_hook; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; if (p) { mmc_data->flags = p->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 8260bc2c34e3..087d88023ba1 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -52,6 +53,8 @@ struct tmio_mmc_host { void (*set_clk_div)(struct platform_device *host, int state); int pm_error; + /* recognise system-wide suspend in runtime PM methods */ + bool pm_global; /* pio related stuff */ struct scatterlist *sg_ptr; @@ -73,8 +76,11 @@ struct tmio_mmc_host { /* Track lost interrupts */ struct delayed_work delayed_reset_work; - spinlock_t lock; + struct work_struct done; + + spinlock_t lock; /* protect host private data */ unsigned long last_req_ts; + struct mutex ios_lock; /* protect set_ios() context */ }; int tmio_mmc_host_probe(struct tmio_mmc_host **host, @@ -103,6 +109,7 @@ static inline void tmio_mmc_kunmap_atomic(struct scatterlist *sg, #if defined(CONFIG_MMC_SDHI) || defined(CONFIG_MMC_SDHI_MODULE) void tmio_mmc_start_dma(struct tmio_mmc_host *host, struct mmc_data *data); +void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable); void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdata); void tmio_mmc_release_dma(struct tmio_mmc_host *host); #else @@ -111,6 +118,10 @@ static inline void tmio_mmc_start_dma(struct tmio_mmc_host *host, { } +static inline void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) +{ +} + static inline void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdata) { @@ -134,4 +145,44 @@ int tmio_mmc_host_resume(struct device *dev); int tmio_mmc_host_runtime_suspend(struct device *dev); int tmio_mmc_host_runtime_resume(struct device *dev); +static inline u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr) +{ + return readw(host->ctl + (addr << host->bus_shift)); +} + +static inline void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr, + u16 *buf, int count) +{ + readsw(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr) +{ + return readw(host->ctl + (addr << host->bus_shift)) | + readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16; +} + +static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) +{ + /* If there is a hook and it returns non-zero then there + * is an error and the write should be skipped + */ + if (host->pdata->write16_hook && host->pdata->write16_hook(host, addr)) + return; + writew(val, host->ctl + (addr << host->bus_shift)); +} + +static inline void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr, + u16 *buf, int count) +{ + writesw(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, u32 val) +{ + writew(val, host->ctl + (addr << host->bus_shift)); + writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); +} + + 
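The hook mechanism above composes cleanly: sd_ctrl_write16() consults pdata->write16_hook and skips the store whenever the hook returns non-zero, and sh_mobile_sdhi installs its bus-idle wait only when TMIO_MMC_HAS_IDLE_WAIT is set. A hypothetical narrower variant, for a platform that only needs to guard the clock-control register (the polling is lifted from sh_mobile_sdhi_wait_idle above; "foo" is illustrative):

static int foo_write16_hook(struct tmio_mmc_host *host, int addr)
{
	int timeout = 1000;

	if (addr != CTL_SD_CARD_CLK_CTL)
		return 0;		/* 0: let sd_ctrl_write16() proceed */

	/* poll the same CTL_STATUS2 idle bit sh_mobile_sdhi waits on */
	while (--timeout && !(sd_ctrl_read16(host, CTL_STATUS2) & (1 << 13)))
		udelay(1);

	return timeout ? 0 : -EBUSY;	/* non-zero: the write is skipped */
}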
#endif diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index bf12513d6297..86f259cdfcbc 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -23,11 +23,14 @@ #define TMIO_MMC_MIN_DMA_LEN 8 -static void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) +void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) { + if (!host->chan_tx || !host->chan_rx) + return; + #if defined(CONFIG_SUPERH) || defined(CONFIG_ARCH_SHMOBILE) /* Switch DMA mode on or off - SuperH specific? */ - writew(enable ? 2 : 0, host->ctl + (0xd8 << host->bus_shift)); + sd_ctrl_write16(host, CTL_DMA_ENABLE, enable ? 2 : 0); #endif } diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 0b09e8239aa0..1f16357e7301 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -46,40 +46,6 @@ #include "tmio_mmc.h" -static u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr) -{ - return readw(host->ctl + (addr << host->bus_shift)); -} - -static void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr, - u16 *buf, int count) -{ - readsw(host->ctl + (addr << host->bus_shift), buf, count); -} - -static u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr) -{ - return readw(host->ctl + (addr << host->bus_shift)) | - readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16; -} - -static void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) -{ - writew(val, host->ctl + (addr << host->bus_shift)); -} - -static void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr, - u16 *buf, int count) -{ - writesw(host->ctl + (addr << host->bus_shift), buf, count); -} - -static void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, u32 val) -{ - writew(val, host->ctl + (addr << host->bus_shift)); - writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); -} - void tmio_mmc_enable_mmc_irqs(struct tmio_mmc_host *host, u32 i) { u32 mask = sd_ctrl_read32(host, CTL_IRQ_MASK) & ~(i & TMIO_MASK_IRQ); @@ -284,10 +250,16 @@ static void tmio_mmc_reset_work(struct work_struct *work) /* called with host->lock held, interrupts disabled */ static void tmio_mmc_finish_request(struct tmio_mmc_host *host) { - struct mmc_request *mrq = host->mrq; + struct mmc_request *mrq; + unsigned long flags; + + spin_lock_irqsave(&host->lock, flags); - if (!mrq) + mrq = host->mrq; + if (IS_ERR_OR_NULL(mrq)) { + spin_unlock_irqrestore(&host->lock, flags); return; + } host->cmd = NULL; host->data = NULL; @@ -296,11 +268,18 @@ static void tmio_mmc_finish_request(struct tmio_mmc_host *host) cancel_delayed_work(&host->delayed_reset_work); host->mrq = NULL; + spin_unlock_irqrestore(&host->lock, flags); - /* FIXME: mmc_request_done() can schedule! */ mmc_request_done(host->mmc, mrq); } +static void tmio_mmc_done_work(struct work_struct *work) +{ + struct tmio_mmc_host *host = container_of(work, struct tmio_mmc_host, + done); + tmio_mmc_finish_request(host); +} + /* These are the bitmasks the tmio chip requires to implement the MMC response * types. Note that R1 and R6 are the same in this scheme. 
*/ #define APP_CMD 0x0040 @@ -467,7 +446,7 @@ void tmio_mmc_do_data_irq(struct tmio_mmc_host *host) BUG(); } - tmio_mmc_finish_request(host); + schedule_work(&host->done); } static void tmio_mmc_data_irq(struct tmio_mmc_host *host) @@ -557,7 +536,7 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host, tasklet_schedule(&host->dma_issue); } } else { - tmio_mmc_finish_request(host); + schedule_work(&host->done); } out: @@ -567,6 +546,7 @@ out: irqreturn_t tmio_mmc_irq(int irq, void *devid) { struct tmio_mmc_host *host = devid; + struct mmc_host *mmc = host->mmc; struct tmio_mmc_data *pdata = host->pdata; unsigned int ireg, irq_mask, status; unsigned int sdio_ireg, sdio_irq_mask, sdio_status; @@ -588,13 +568,13 @@ irqreturn_t tmio_mmc_irq(int irq, void *devid) if (sdio_ireg && !host->sdio_irq_enabled) { pr_warning("tmio_mmc: Spurious SDIO IRQ, disabling! 0x%04x 0x%04x 0x%04x\n", sdio_status, sdio_irq_mask, sdio_ireg); - tmio_mmc_enable_sdio_irq(host->mmc, 0); + tmio_mmc_enable_sdio_irq(mmc, 0); goto out; } - if (host->mmc->caps & MMC_CAP_SDIO_IRQ && + if (mmc->caps & MMC_CAP_SDIO_IRQ && sdio_ireg & TMIO_SDIO_STAT_IOIRQ) - mmc_signal_sdio_irq(host->mmc); + mmc_signal_sdio_irq(mmc); if (sdio_ireg) goto out; @@ -603,58 +583,49 @@ irqreturn_t tmio_mmc_irq(int irq, void *devid) pr_debug_status(status); pr_debug_status(ireg); - if (!ireg) { - tmio_mmc_disable_mmc_irqs(host, status & ~irq_mask); - - pr_warning("tmio_mmc: Spurious irq, disabling! " - "0x%08x 0x%08x 0x%08x\n", status, irq_mask, ireg); - pr_debug_status(status); - + /* Card insert / remove attempts */ + if (ireg & (TMIO_STAT_CARD_INSERT | TMIO_STAT_CARD_REMOVE)) { + tmio_mmc_ack_mmc_irqs(host, TMIO_STAT_CARD_INSERT | + TMIO_STAT_CARD_REMOVE); + if ((((ireg & TMIO_STAT_CARD_REMOVE) && mmc->card) || + ((ireg & TMIO_STAT_CARD_INSERT) && !mmc->card)) && + !work_pending(&mmc->detect.work)) + mmc_detect_change(host->mmc, msecs_to_jiffies(100)); goto out; } - while (ireg) { - /* Card insert / remove attempts */ - if (ireg & (TMIO_STAT_CARD_INSERT | TMIO_STAT_CARD_REMOVE)) { - tmio_mmc_ack_mmc_irqs(host, TMIO_STAT_CARD_INSERT | - TMIO_STAT_CARD_REMOVE); - mmc_detect_change(host->mmc, msecs_to_jiffies(100)); - } - - /* CRC and other errors */ -/* if (ireg & TMIO_STAT_ERR_IRQ) - * handled |= tmio_error_irq(host, irq, stat); + /* CRC and other errors */ +/* if (ireg & TMIO_STAT_ERR_IRQ) + * handled |= tmio_error_irq(host, irq, stat); */ - /* Command completion */ - if (ireg & (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT)) { - tmio_mmc_ack_mmc_irqs(host, - TMIO_STAT_CMDRESPEND | - TMIO_STAT_CMDTIMEOUT); - tmio_mmc_cmd_irq(host, status); - } - - /* Data transfer */ - if (ireg & (TMIO_STAT_RXRDY | TMIO_STAT_TXRQ)) { - tmio_mmc_ack_mmc_irqs(host, TMIO_STAT_RXRDY | TMIO_STAT_TXRQ); - tmio_mmc_pio_irq(host); - } - - /* Data transfer completion */ - if (ireg & TMIO_STAT_DATAEND) { - tmio_mmc_ack_mmc_irqs(host, TMIO_STAT_DATAEND); - tmio_mmc_data_irq(host); - } + /* Command completion */ + if (ireg & (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT)) { + tmio_mmc_ack_mmc_irqs(host, + TMIO_STAT_CMDRESPEND | + TMIO_STAT_CMDTIMEOUT); + tmio_mmc_cmd_irq(host, status); + goto out; + } - /* Check status - keep going until we've handled it all */ - status = sd_ctrl_read32(host, CTL_STATUS); - irq_mask = sd_ctrl_read32(host, CTL_IRQ_MASK); - ireg = status & TMIO_MASK_IRQ & ~irq_mask; + /* Data transfer */ + if (ireg & (TMIO_STAT_RXRDY | TMIO_STAT_TXRQ)) { + tmio_mmc_ack_mmc_irqs(host, TMIO_STAT_RXRDY | TMIO_STAT_TXRQ); + tmio_mmc_pio_irq(host); + goto out; + } 
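The two schedule_work() calls above are the point of the tmio rework: mmc_request_done() may sleep, so request completion is moved out of hard-IRQ context into the host's "done" work item (the old FIXME about this is gone). The pattern in isolation, as a minimal self-contained sketch with hypothetical "foo" names:

#include <linux/workqueue.h>
#include <linux/interrupt.h>

struct foo_host {
	struct work_struct done;
};

static void foo_done_work(struct work_struct *work)
{
	struct foo_host *host = container_of(work, struct foo_host, done);

	/* process context: safe to call sleeping completion paths here */
	(void)host;
}

static irqreturn_t foo_irq(int irq, void *devid)
{
	struct foo_host *host = devid;

	schedule_work(&host->done);	/* never complete in hard IRQ */
	return IRQ_HANDLED;
}

As in the hunks here, the work item is set up once at probe time with INIT_WORK() and flushed with cancel_work_sync() on removal, before any state it touches is torn down.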
- pr_debug("Status at end of loop: %08x\n", status); - pr_debug_status(status); + /* Data transfer completion */ + if (ireg & TMIO_STAT_DATAEND) { + tmio_mmc_ack_mmc_irqs(host, TMIO_STAT_DATAEND); + tmio_mmc_data_irq(host); + goto out; } - pr_debug("MMC IRQ end\n"); + + pr_warning("tmio_mmc: Spurious irq, disabling! " + "0x%08x 0x%08x 0x%08x\n", status, irq_mask, ireg); + pr_debug_status(status); + tmio_mmc_disable_mmc_irqs(host, status & ~irq_mask); out: return IRQ_HANDLED; @@ -749,6 +720,8 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) struct tmio_mmc_data *pdata = host->pdata; unsigned long flags; + mutex_lock(&host->ios_lock); + spin_lock_irqsave(&host->lock, flags); if (host->mrq) { if (IS_ERR(host->mrq)) { @@ -764,6 +737,8 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) host->mrq->cmd->opcode, host->last_req_ts, jiffies); } spin_unlock_irqrestore(&host->lock, flags); + + mutex_unlock(&host->ios_lock); return; } @@ -771,33 +746,30 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) spin_unlock_irqrestore(&host->lock, flags); - if (ios->clock) - tmio_mmc_set_clock(host, ios->clock); - - /* Power sequence - OFF -> UP -> ON */ - if (ios->power_mode == MMC_POWER_UP) { - if ((pdata->flags & TMIO_MMC_HAS_COLD_CD) && !pdata->power) { + /* + * pdata->power == false only if COLD_CD is available, otherwise only + * in short time intervals during probing or resuming + */ + if (ios->power_mode == MMC_POWER_ON && ios->clock) { + if (!pdata->power) { pm_runtime_get_sync(&host->pdev->dev); pdata->power = true; } + tmio_mmc_set_clock(host, ios->clock); /* power up SD bus */ if (host->set_pwr) host->set_pwr(host->pdev, 1); - } else if (ios->power_mode == MMC_POWER_OFF || !ios->clock) { - /* power down SD bus */ - if (ios->power_mode == MMC_POWER_OFF) { - if (host->set_pwr) - host->set_pwr(host->pdev, 0); - if ((pdata->flags & TMIO_MMC_HAS_COLD_CD) && - pdata->power) { - pdata->power = false; - pm_runtime_put(&host->pdev->dev); - } - } - tmio_mmc_clk_stop(host); - } else { /* start bus clock */ tmio_mmc_clk_start(host); + } else if (ios->power_mode != MMC_POWER_UP) { + if (host->set_pwr) + host->set_pwr(host->pdev, 0); + if ((pdata->flags & TMIO_MMC_HAS_COLD_CD) && + pdata->power) { + pdata->power = false; + pm_runtime_put(&host->pdev->dev); + } + tmio_mmc_clk_stop(host); } switch (ios->bus_width) { @@ -817,6 +789,8 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) current->comm, task_pid_nr(current), ios->clock, ios->power_mode); host->mrq = NULL; + + mutex_unlock(&host->ios_lock); } static int tmio_mmc_get_ro(struct mmc_host *mmc) @@ -913,16 +887,20 @@ int __devinit tmio_mmc_host_probe(struct tmio_mmc_host **host, tmio_mmc_enable_sdio_irq(mmc, 0); spin_lock_init(&_host->lock); + mutex_init(&_host->ios_lock); /* Init delayed work for request timeouts */ INIT_DELAYED_WORK(&_host->delayed_reset_work, tmio_mmc_reset_work); + INIT_WORK(&_host->done, tmio_mmc_done_work); /* See if we also get DMA */ tmio_mmc_request_dma(_host, pdata); /* We have to keep the device powered for its card detection to work */ - if (!(pdata->flags & TMIO_MMC_HAS_COLD_CD)) + if (!(pdata->flags & TMIO_MMC_HAS_COLD_CD)) { + pdata->power = true; pm_runtime_get_noresume(&pdev->dev); + } mmc_add_host(mmc); @@ -963,6 +941,7 @@ void tmio_mmc_host_remove(struct tmio_mmc_host *host) pm_runtime_get_sync(&pdev->dev); mmc_remove_host(host->mmc); + cancel_work_sync(&host->done); cancel_delayed_work_sync(&host->delayed_reset_work); 
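Reassembled from the set_ios() hunks above, the resulting power sequence reads as follows (a reconstruction for readability, not new code; the surrounding ios_lock/mrq handling is omitted):

if (ios->power_mode == MMC_POWER_ON && ios->clock) {
	if (!pdata->power) {
		pm_runtime_get_sync(&host->pdev->dev);
		pdata->power = true;
	}
	tmio_mmc_set_clock(host, ios->clock);
	/* power up SD bus */
	if (host->set_pwr)
		host->set_pwr(host->pdev, 1);
	/* start bus clock */
	tmio_mmc_clk_start(host);
} else if (ios->power_mode != MMC_POWER_UP) {
	if (host->set_pwr)
		host->set_pwr(host->pdev, 0);
	if ((pdata->flags & TMIO_MMC_HAS_COLD_CD) && pdata->power) {
		pdata->power = false;
		pm_runtime_put(&host->pdev->dev);
	}
	tmio_mmc_clk_stop(host);
}

In words: the runtime-PM reference is taken on the first real power-up and dropped only on the cold-card-detect path, so card detection keeps working while the bus itself is powered down.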
tmio_mmc_release_dma(host); @@ -998,11 +977,16 @@ int tmio_mmc_host_resume(struct device *dev) /* The MMC core will perform the complete set up */ host->pdata->power = false; + host->pm_global = true; if (!host->pm_error) pm_runtime_get_sync(dev); - tmio_mmc_reset(mmc_priv(mmc)); - tmio_mmc_request_dma(host, host->pdata); + if (host->pm_global) { + /* Runtime PM resume callback didn't run */ + tmio_mmc_reset(host); + tmio_mmc_enable_dma(host, true); + host->pm_global = false; + } return mmc_resume_host(mmc); } @@ -1023,12 +1007,15 @@ int tmio_mmc_host_runtime_resume(struct device *dev) struct tmio_mmc_data *pdata = host->pdata; tmio_mmc_reset(host); + tmio_mmc_enable_dma(host, true); if (pdata->power) { /* Only entered after a card-insert interrupt */ - tmio_mmc_set_ios(mmc, &mmc->ios); + if (!mmc->card) + tmio_mmc_set_ios(mmc, &mmc->ios); mmc_detect_change(mmc, msecs_to_jiffies(100)); } + host->pm_global = false; return 0; } diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 65626c1c446d..6c3fb5ab20f5 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -953,10 +953,14 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset) if (!ubi->peb_buf2) goto out_free; + err = ubi_debugging_init_dev(ubi); + if (err) + goto out_free; + err = attach_by_scanning(ubi); if (err) { dbg_err("failed to attach by scanning, error %d", err); - goto out_free; + goto out_debugging; } if (ubi->autoresize_vol_id != -1) { @@ -969,12 +973,16 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset) if (err) goto out_detach; + err = ubi_debugfs_init_dev(ubi); + if (err) + goto out_uif; + ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name); if (IS_ERR(ubi->bgt_thread)) { err = PTR_ERR(ubi->bgt_thread); ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name, err); - goto out_uif; + goto out_debugfs; } ubi_msg("attached mtd%d to ubi%d", mtd->index, ubi_num); @@ -1008,12 +1016,18 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset) ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; +out_debugfs: + ubi_debugfs_exit_dev(ubi); out_uif: + get_device(&ubi->dev); + ubi_assert(ref); uif_close(ubi); out_detach: ubi_wl_close(ubi); free_internal_volumes(ubi); vfree(ubi->vtbl); +out_debugging: + ubi_debugging_exit_dev(ubi); out_free: vfree(ubi->peb_buf1); vfree(ubi->peb_buf2); @@ -1080,11 +1094,13 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway) */ get_device(&ubi->dev); + ubi_debugfs_exit_dev(ubi); uif_close(ubi); ubi_wl_close(ubi); free_internal_volumes(ubi); vfree(ubi->vtbl); put_mtd_device(ubi->mtd); + ubi_debugging_exit_dev(ubi); vfree(ubi->peb_buf1); vfree(ubi->peb_buf2); ubi_msg("mtd%d is detached from ubi%d", ubi->mtd->index, ubi->ubi_num); @@ -1199,6 +1215,11 @@ static int __init ubi_init(void) if (!ubi_wl_entry_slab) goto out_dev_unreg; + err = ubi_debugfs_init(); + if (err) + goto out_slab; + + /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { struct mtd_dev_param *p = &mtd_dev_param[i]; @@ -1247,6 +1268,8 @@ out_detach: ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } + ubi_debugfs_exit(); +out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: misc_deregister(&ubi_ctrl_cdev); @@ -1270,6 +1293,7 @@ static void __exit ubi_exit(void) ubi_detach_mtd_dev(ubi_devices[i]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } + ubi_debugfs_exit(); kmem_cache_destroy(ubi_wl_entry_slab); misc_deregister(&ubi_ctrl_cdev); class_remove_file(ubi_class, 
&ubi_version); diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index 2224cbe41ddf..ab80c0debac8 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -27,17 +27,9 @@ #ifdef CONFIG_MTD_UBI_DEBUG #include "ubi.h" +#include <linux/debugfs.h> +#include <linux/uaccess.h> #include <linux/module.h> -#include <linux/moduleparam.h> - -unsigned int ubi_chk_flags; -unsigned int ubi_tst_flags; - -module_param_named(debug_chks, ubi_chk_flags, uint, S_IRUGO | S_IWUSR); -module_param_named(debug_tsts, ubi_chk_flags, uint, S_IRUGO | S_IWUSR); - -MODULE_PARM_DESC(debug_chks, "Debug check flags"); -MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); /** * ubi_dbg_dump_ec_hdr - dump an erase counter header. @@ -239,4 +231,261 @@ out: return; } +/** + * ubi_debugging_init_dev - initialize debugging for an UBI device. + * @ubi: UBI device description object + * + * This function initializes debugging-related data for UBI device @ubi. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubi_debugging_init_dev(struct ubi_device *ubi) +{ + ubi->dbg = kzalloc(sizeof(struct ubi_debug_info), GFP_KERNEL); + if (!ubi->dbg) + return -ENOMEM; + + return 0; +} + +/** + * ubi_debugging_exit_dev - free debugging data for an UBI device. + * @ubi: UBI device description object + */ +void ubi_debugging_exit_dev(struct ubi_device *ubi) +{ + kfree(ubi->dbg); +} + +/* + * Root directory for UBI stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular UBI devices. + */ +static struct dentry *dfs_rootdir; + +/** + * ubi_debugfs_init - create UBI debugfs directory. + * + * Create UBI debugfs directory. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubi_debugfs_init(void) +{ + dfs_rootdir = debugfs_create_dir("ubi", NULL); + if (IS_ERR_OR_NULL(dfs_rootdir)) { + int err = dfs_rootdir ? -ENODEV : PTR_ERR(dfs_rootdir); + + ubi_err("cannot create \"ubi\" debugfs directory, error %d\n", + err); + return err; + } + + return 0; + } + +/** + * ubi_debugfs_exit - remove UBI debugfs directory.
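+ * (Editor's note: this removes only the root "ubi" directory; each
+ * device's sub-directory is removed separately by ubi_debugfs_exit_dev().)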
+ */ +void ubi_debugfs_exit(void) +{ + debugfs_remove(dfs_rootdir); +} + +/* Read an UBI debugfs file */ +static ssize_t dfs_file_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + unsigned long ubi_num = (unsigned long)file->private_data; + struct dentry *dent = file->f_path.dentry; + struct ubi_device *ubi; + struct ubi_debug_info *d; + char buf[3]; + int val; + + ubi = ubi_get_device(ubi_num); + if (!ubi) + return -ENODEV; + d = ubi->dbg; + + if (dent == d->dfs_chk_gen) + val = d->chk_gen; + else if (dent == d->dfs_chk_io) + val = d->chk_io; + else if (dent == d->dfs_disable_bgt) + val = d->disable_bgt; + else if (dent == d->dfs_emulate_bitflips) + val = d->emulate_bitflips; + else if (dent == d->dfs_emulate_io_failures) + val = d->emulate_io_failures; + else { + count = -EINVAL; + goto out; + } + + if (val) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + + count = simple_read_from_buffer(user_buf, count, ppos, buf, 2); + +out: + ubi_put_device(ubi); + return count; +} + +/* Write an UBI debugfs file */ +static ssize_t dfs_file_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + unsigned long ubi_num = (unsigned long)file->private_data; + struct dentry *dent = file->f_path.dentry; + struct ubi_device *ubi; + struct ubi_debug_info *d; + size_t buf_size; + char buf[8]; + int val; + + ubi = ubi_get_device(ubi_num); + if (!ubi) + return -ENODEV; + d = ubi->dbg; + + buf_size = min_t(size_t, count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) { + count = -EFAULT; + goto out; + } + + if (buf[0] == '1') + val = 1; + else if (buf[0] == '0') + val = 0; + else { + count = -EINVAL; + goto out; + } + + if (dent == d->dfs_chk_gen) + d->chk_gen = val; + else if (dent == d->dfs_chk_io) + d->chk_io = val; + else if (dent == d->dfs_disable_bgt) + d->disable_bgt = val; + else if (dent == d->dfs_emulate_bitflips) + d->emulate_bitflips = val; + else if (dent == d->dfs_emulate_io_failures) + d->emulate_io_failures = val; + else + count = -EINVAL; + +out: + ubi_put_device(ubi); + return count; +} + +static int default_open(struct inode *inode, struct file *file) +{ + if (inode->i_private) + file->private_data = inode->i_private; + + return 0; +} + +/* File operations for all UBI debugfs files */ +static const struct file_operations dfs_fops = { + .read = dfs_file_read, + .write = dfs_file_write, + .open = default_open, + .llseek = no_llseek, + .owner = THIS_MODULE, +}; + +/** + * ubi_debugfs_init_dev - initialize debugfs for an UBI device. + * @ubi: UBI device description object + * + * This function creates all debugfs files for UBI device @ubi. Returns zero in + * case of success and a negative error code in case of failure. 
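+ * (Editor's note: the knobs are created under the per-device directory,
+ * conventionally /sys/kernel/debug/ubi/ubiX/, once debugfs is mounted.)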
+ */ +int ubi_debugfs_init_dev(struct ubi_device *ubi) +{ + int err, n; + unsigned long ubi_num = ubi->ubi_num; + const char *fname; + struct dentry *dent; + struct ubi_debug_info *d = ubi->dbg; + + n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME, + ubi->ubi_num); + if (n == UBI_DFS_DIR_LEN) { + /* The array size is too small */ + fname = UBI_DFS_DIR_NAME; + dent = ERR_PTR(-EINVAL); + goto out; + } + + fname = d->dfs_dir_name; + dent = debugfs_create_dir(fname, dfs_rootdir); + if (IS_ERR_OR_NULL(dent)) + goto out; + d->dfs_dir = dent; + + fname = "chk_gen"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, (void *)ubi_num, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_gen = dent; + + fname = "chk_io"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, (void *)ubi_num, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_io = dent; + + fname = "tst_disable_bgt"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, (void *)ubi_num, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_disable_bgt = dent; + + fname = "tst_emulate_bitflips"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, (void *)ubi_num, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_emulate_bitflips = dent; + + fname = "tst_emulate_io_failures"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, (void *)ubi_num, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_emulate_io_failures = dent; + + return 0; + +out_remove: + debugfs_remove_recursive(d->dfs_dir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + ubi_err("cannot create \"%s\" debugfs file or directory, error %d\n", + fname, err); + return err; +} + +/** + * ubi_debugfs_exit_dev - free all debugfs files corresponding to device @ubi + * @ubi: UBI device description object + */ +void ubi_debugfs_exit_dev(struct ubi_device *ubi) +{ + debugfs_remove_recursive(ubi->dbg->dfs_dir); +} + #endif /* CONFIG_MTD_UBI_DEBUG */ diff --git a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h index 3f1a09c5c438..65b5b76cc379 100644 --- a/drivers/mtd/ubi/debug.h +++ b/drivers/mtd/ubi/debug.h @@ -21,14 +21,6 @@ #ifndef __UBI_DEBUG_H__ #define __UBI_DEBUG_H__ -struct ubi_ec_hdr; -struct ubi_vid_hdr; -struct ubi_volume; -struct ubi_vtbl_record; -struct ubi_scan_volume; -struct ubi_scan_leb; -struct ubi_mkvol_req; - #ifdef CONFIG_MTD_UBI_DEBUG #include <linux/random.h> @@ -71,86 +63,103 @@ void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv); void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type); void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req); void ubi_dbg_dump_flash(struct ubi_device *ubi, int pnum, int offset, int len); - -extern unsigned int ubi_chk_flags; - -/* - * Debugging check flags. - * - * UBI_CHK_GEN: general checks - * UBI_CHK_IO: check writes and erases - */ -enum { - UBI_CHK_GEN = 0x1, - UBI_CHK_IO = 0x2, -}; - int ubi_dbg_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len); int ubi_dbg_check_write(struct ubi_device *ubi, const void *buf, int pnum, int offset, int len); - -extern unsigned int ubi_tst_flags; +int ubi_debugging_init_dev(struct ubi_device *ubi); +void ubi_debugging_exit_dev(struct ubi_device *ubi); +int ubi_debugfs_init(void); +void ubi_debugfs_exit(void); +int ubi_debugfs_init_dev(struct ubi_device *ubi); +void ubi_debugfs_exit_dev(struct ubi_device *ubi); /* - * Special testing flags.
+ * The UBI debugfs directory name pattern and maximum name length (3 for "ubi" + * + 2 for the number + 1 for the trailing zero byte). + */ +#define UBI_DFS_DIR_NAME "ubi%d" +#define UBI_DFS_DIR_LEN (3 + 2 + 1) + +/** + * struct ubi_debug_info - debugging information for an UBI device. * - * UBIFS_TST_DISABLE_BGT: disable the background thread - * UBI_TST_EMULATE_BITFLIPS: emulate bit-flips - * UBI_TST_EMULATE_WRITE_FAILURES: emulate write failures - * UBI_TST_EMULATE_ERASE_FAILURES: emulate erase failures + * @chk_gen: if UBI general extra checks are enabled + * @chk_io: if UBI I/O extra checks are enabled + * @disable_bgt: disable the background task for testing purposes + * @emulate_bitflips: emulate bit-flips for testing purposes + * @emulate_io_failures: emulate write/erase failures for testing purposes + * @dfs_dir_name: name of debugfs directory containing files of this UBI device + * @dfs_dir: direntry object of the UBI device debugfs directory + * @dfs_chk_gen: debugfs knob to enable UBI general extra checks + * @dfs_chk_io: debugfs knob to enable UBI I/O extra checks + * @dfs_disable_bgt: debugfs knob to disable the background task + * @dfs_emulate_bitflips: debugfs knob to emulate bit-flips + * @dfs_emulate_io_failures: debugfs knob to emulate write/erase failures */ -enum { - UBI_TST_DISABLE_BGT = 0x1, - UBI_TST_EMULATE_BITFLIPS = 0x2, - UBI_TST_EMULATE_WRITE_FAILURES = 0x4, - UBI_TST_EMULATE_ERASE_FAILURES = 0x8, +struct ubi_debug_info { + unsigned int chk_gen:1; + unsigned int chk_io:1; + unsigned int disable_bgt:1; + unsigned int emulate_bitflips:1; + unsigned int emulate_io_failures:1; + char dfs_dir_name[UBI_DFS_DIR_LEN + 1]; + struct dentry *dfs_dir; + struct dentry *dfs_chk_gen; + struct dentry *dfs_chk_io; + struct dentry *dfs_disable_bgt; + struct dentry *dfs_emulate_bitflips; + struct dentry *dfs_emulate_io_failures; }; /** * ubi_dbg_is_bgt_disabled - if the background thread is disabled. + * @ubi: UBI device description object * * Returns non-zero if the UBI background thread is disabled for testing * purposes. */ -static inline int ubi_dbg_is_bgt_disabled(void) +static inline int ubi_dbg_is_bgt_disabled(const struct ubi_device *ubi) { - return ubi_tst_flags & UBI_TST_DISABLE_BGT; + return ubi->dbg->disable_bgt; } /** * ubi_dbg_is_bitflip - if it is time to emulate a bit-flip. + * @ubi: UBI device description object * * Returns non-zero if a bit-flip should be emulated, otherwise returns zero. */ -static inline int ubi_dbg_is_bitflip(void) +static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi) { - if (ubi_tst_flags & UBI_TST_EMULATE_BITFLIPS) + if (ubi->dbg->emulate_bitflips) return !(random32() % 200); return 0; } /** * ubi_dbg_is_write_failure - if it is time to emulate a write failure. + * @ubi: UBI device description object * * Returns non-zero if a write failure should be emulated, otherwise returns * zero. */ -static inline int ubi_dbg_is_write_failure(void) +static inline int ubi_dbg_is_write_failure(const struct ubi_device *ubi) { - if (ubi_tst_flags & UBI_TST_EMULATE_WRITE_FAILURES) + if (ubi->dbg->emulate_io_failures) return !(random32() % 500); return 0; } /** * ubi_dbg_is_erase_failure - if it's time to emulate an erase failure. + * @ubi: UBI device description object * * Returns non-zero if an erase failure should be emulated, otherwise returns * zero.
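+ * (Editor's note: write and erase failure emulation share the single
+ * emulate_io_failures knob; only the rates differ - roughly 1 in 500
+ * writes vs. 1 in 400 erases.)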
*/ -static inline int ubi_dbg_is_erase_failure(void) +static inline int ubi_dbg_is_erase_failure(const struct ubi_device *ubi) { - if (ubi_tst_flags & UBI_TST_EMULATE_ERASE_FAILURES) + if (ubi->dbg->emulate_io_failures) return !(random32() % 400); return 0; } @@ -201,11 +210,6 @@ static inline void ubi_dbg_dump_flash(struct ubi_device *ubi, static inline void ubi_dbg_print_hex_dump(const char *l, const char *ps, int pt, int r, int g, const void *b, size_t len, bool a) { return; } - -static inline int ubi_dbg_is_bgt_disabled(void) { return 0; } -static inline int ubi_dbg_is_bitflip(void) { return 0; } -static inline int ubi_dbg_is_write_failure(void) { return 0; } -static inline int ubi_dbg_is_erase_failure(void) { return 0; } static inline int ubi_dbg_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) { return 0; } @@ -213,5 +217,20 @@ static inline int ubi_dbg_check_write(struct ubi_device *ubi, const void *buf, int pnum, int offset, int len) { return 0; } +static inline int ubi_debugging_init_dev(struct ubi_device *ubi) { return 0; } +static inline void ubi_debugging_exit_dev(struct ubi_device *ubi) { return; } +static inline int ubi_debugfs_init(void) { return 0; } +static inline void ubi_debugfs_exit(void) { return; } +static inline int ubi_debugfs_init_dev(struct ubi_device *ubi) { return 0; } +static inline void ubi_debugfs_exit_dev(struct ubi_device *ubi) { return; } + +static inline int +ubi_dbg_is_bgt_disabled(const struct ubi_device *ubi) { return 0; } +static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi) { return 0; } +static inline int +ubi_dbg_is_write_failure(const struct ubi_device *ubi) { return 0; } +static inline int +ubi_dbg_is_erase_failure(const struct ubi_device *ubi) { return 0; } + #endif /* !CONFIG_MTD_UBI_DEBUG */ #endif /* !__UBI_DEBUG_H__ */ diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index 8c1b1c7bc4a7..6ba55c235873 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -212,7 +212,7 @@ retry: } else { ubi_assert(len == read); - if (ubi_dbg_is_bitflip()) { + if (ubi_dbg_is_bitflip(ubi)) { dbg_gen("bit-flip (emulated)"); err = UBI_IO_BITFLIPS; } @@ -281,7 +281,7 @@ int ubi_io_write(struct ubi_device *ubi, const void *buf, int pnum, int offset, return err; } - if (ubi_dbg_is_write_failure()) { + if (ubi_dbg_is_write_failure(ubi)) { dbg_err("cannot write %d bytes to PEB %d:%d " "(emulated)", len, pnum, offset); ubi_dbg_dump_stack(); @@ -396,7 +396,7 @@ retry: if (err) return err; - if (ubi_dbg_is_erase_failure()) { + if (ubi_dbg_is_erase_failure(ubi)) { dbg_err("cannot erase PEB %d (emulated)", pnum); return -EIO; } @@ -1146,7 +1146,7 @@ static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum) { int err; - if (!(ubi_chk_flags & UBI_CHK_IO)) + if (!ubi->dbg->chk_io) return 0; err = ubi_io_is_bad(ubi, pnum); @@ -1173,7 +1173,7 @@ static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum, int err; uint32_t magic; - if (!(ubi_chk_flags & UBI_CHK_IO)) + if (!ubi->dbg->chk_io) return 0; magic = be32_to_cpu(ec_hdr->magic); @@ -1211,7 +1211,7 @@ static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum) uint32_t crc, hdr_crc; struct ubi_ec_hdr *ec_hdr; - if (!(ubi_chk_flags & UBI_CHK_IO)) + if (!ubi->dbg->chk_io) return 0; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); @@ -1255,7 +1255,7 @@ static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum, int err; uint32_t magic; - if (!(ubi_chk_flags & UBI_CHK_IO)) + if (!ubi->dbg->chk_io) return 0; magic = 
be32_to_cpu(vid_hdr->magic); @@ -1296,7 +1296,7 @@ static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum) struct ubi_vid_hdr *vid_hdr; void *p; - if (!(ubi_chk_flags & UBI_CHK_IO)) + if (!ubi->dbg->chk_io) return 0; vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS); @@ -1348,7 +1348,7 @@ int ubi_dbg_check_write(struct ubi_device *ubi, const void *buf, int pnum, void *buf1; loff_t addr = (loff_t)pnum * ubi->peb_size + offset; - if (!(ubi_chk_flags & UBI_CHK_IO)) + if (!ubi->dbg->chk_io) return 0; buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); @@ -1412,7 +1412,7 @@ int ubi_dbg_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) void *buf; loff_t addr = (loff_t)pnum * ubi->peb_size + offset; - if (!(ubi_chk_flags & UBI_CHK_IO)) + if (!ubi->dbg->chk_io) return 0; buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c index 2135a53732ff..a3a198f9b98d 100644 --- a/drivers/mtd/ubi/scan.c +++ b/drivers/mtd/ubi/scan.c @@ -1347,7 +1347,7 @@ static int paranoid_check_si(struct ubi_device *ubi, struct ubi_scan_info *si) struct ubi_scan_leb *seb, *last_seb; uint8_t *buf; - if (!(ubi_chk_flags & UBI_CHK_GEN)) + if (!ubi->dbg->chk_gen) return 0; /* diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index c6c22295898e..dc64c767fd21 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -44,7 +44,6 @@ #include "ubi-media.h" #include "scan.h" -#include "debug.h" /* Maximum number of supported UBI devices */ #define UBI_MAX_DEVICES 32 @@ -390,6 +389,8 @@ struct ubi_wl_entry; * @peb_buf2: another buffer of PEB size used for different purposes * @buf_mutex: protects @peb_buf1 and @peb_buf2 * @ckvol_mutex: serializes static volume checking when opening + * + * @dbg: debugging information for this UBI device */ struct ubi_device { struct cdev cdev; @@ -472,8 +473,12 @@ struct ubi_device { void *peb_buf2; struct mutex buf_mutex; struct mutex ckvol_mutex; + + struct ubi_debug_info *dbg; }; +#include "debug.h" + extern struct kmem_cache *ubi_wl_entry_slab; extern const struct file_operations ubi_ctrl_cdev_operations; extern const struct file_operations ubi_cdev_operations; @@ -662,6 +667,7 @@ static inline void ubi_ro_mode(struct ubi_device *ubi) if (!ubi->ro_mode) { ubi->ro_mode = 1; ubi_warn("switch to read-only mode"); + ubi_dbg_dump_stack(); } } diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 366eb70219a6..97e093d19672 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -871,7 +871,7 @@ static int paranoid_check_volumes(struct ubi_device *ubi) { int i, err = 0; - if (!(ubi_chk_flags & UBI_CHK_GEN)) + if (!ubi->dbg->chk_gen) return 0; for (i = 0; i < ubi->vtbl_slots; i++) { diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c index fd3bf770f518..4b50a3029b84 100644 --- a/drivers/mtd/ubi/vtbl.c +++ b/drivers/mtd/ubi/vtbl.c @@ -307,8 +307,7 @@ static int create_vtbl(struct ubi_device *ubi, struct ubi_scan_info *si, { int err, tries = 0; static struct ubi_vid_hdr *vid_hdr; - struct ubi_scan_volume *sv; - struct ubi_scan_leb *new_seb, *old_seb = NULL; + struct ubi_scan_leb *new_seb; ubi_msg("create volume table (copy #%d)", copy + 1); @@ -316,15 +315,6 @@ static int create_vtbl(struct ubi_device *ubi, struct ubi_scan_info *si, if (!vid_hdr) return -ENOMEM; - /* - * Check if there is a logical eraseblock which would have to contain - * this volume table copy was found during scanning. It has to be wiped - * out. 
- */ - sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOLUME_ID); - if (sv) - old_seb = ubi_scan_find_seb(sv, copy); - retry: new_seb = ubi_scan_get_free_peb(ubi, si); if (IS_ERR(new_seb)) { @@ -351,8 +341,8 @@ retry: goto write_error; /* - * And add it to the scanning information. Don't delete the old - * @old_seb as it will be deleted and freed in 'ubi_scan_add_used()'. + * And add it to the scanning information. Don't delete the old version + * of this LEB as it will be deleted and freed in 'ubi_scan_add_used()'. */ err = ubi_scan_add_used(ubi, si, new_seb->pnum, new_seb->ec, vid_hdr, 0); @@ -876,7 +866,7 @@ out_free: */ static void paranoid_vtbl_check(const struct ubi_device *ubi) { - if (!(ubi_chk_flags & UBI_CHK_GEN)) + if (!ubi->dbg->chk_gen) return; if (vtbl_check(ubi, ubi->vtbl)) { diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index ff2c4956eeff..42c684cf3688 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -163,12 +163,14 @@ struct ubi_work { #ifdef CONFIG_MTD_UBI_DEBUG static int paranoid_check_ec(struct ubi_device *ubi, int pnum, int ec); -static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, +static int paranoid_check_in_wl_tree(const struct ubi_device *ubi, + struct ubi_wl_entry *e, struct rb_root *root); -static int paranoid_check_in_pq(struct ubi_device *ubi, struct ubi_wl_entry *e); +static int paranoid_check_in_pq(const struct ubi_device *ubi, + struct ubi_wl_entry *e); #else #define paranoid_check_ec(ubi, pnum, ec) 0 -#define paranoid_check_in_wl_tree(e, root) +#define paranoid_check_in_wl_tree(ubi, e, root) #define paranoid_check_in_pq(ubi, e) 0 #endif @@ -449,7 +452,7 @@ retry: BUG(); } - paranoid_check_in_wl_tree(e, &ubi->free); + paranoid_check_in_wl_tree(ubi, e, &ubi->free); /* * Move the physical eraseblock to the protection queue where it will @@ -613,7 +616,7 @@ static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) list_add_tail(&wrk->list, &ubi->works); ubi_assert(ubi->works_count >= 0); ubi->works_count += 1; - if (ubi->thread_enabled && !ubi_dbg_is_bgt_disabled()) + if (ubi->thread_enabled && !ubi_dbg_is_bgt_disabled(ubi)) wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); } @@ -712,7 +715,7 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, e1->ec, e2->ec); goto out_cancel; } - paranoid_check_in_wl_tree(e1, &ubi->used); + paranoid_check_in_wl_tree(ubi, e1, &ubi->used); rb_erase(&e1->u.rb, &ubi->used); dbg_wl("move PEB %d EC %d to PEB %d EC %d", e1->pnum, e1->ec, e2->pnum, e2->ec); @@ -721,12 +724,12 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, scrubbing = 1; e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb); e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); - paranoid_check_in_wl_tree(e1, &ubi->scrub); + paranoid_check_in_wl_tree(ubi, e1, &ubi->scrub); rb_erase(&e1->u.rb, &ubi->scrub); dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum); } - paranoid_check_in_wl_tree(e2, &ubi->free); + paranoid_check_in_wl_tree(ubi, e2, &ubi->free); rb_erase(&e2->u.rb, &ubi->free); ubi->move_from = e1; ubi->move_to = e2; @@ -1169,13 +1172,13 @@ retry: return 0; } else { if (in_wl_tree(e, &ubi->used)) { - paranoid_check_in_wl_tree(e, &ubi->used); + paranoid_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else if (in_wl_tree(e,
&ubi->scrub)) { - paranoid_check_in_wl_tree(e, &ubi->scrub); + paranoid_check_in_wl_tree(ubi, e, &ubi->scrub); rb_erase(&e->u.rb, &ubi->scrub); } else if (in_wl_tree(e, &ubi->erroneous)) { - paranoid_check_in_wl_tree(e, &ubi->erroneous); + paranoid_check_in_wl_tree(ubi, e, &ubi->erroneous); rb_erase(&e->u.rb, &ubi->erroneous); ubi->erroneous_peb_count -= 1; ubi_assert(ubi->erroneous_peb_count >= 0); @@ -1242,7 +1245,7 @@ retry: } if (in_wl_tree(e, &ubi->used)) { - paranoid_check_in_wl_tree(e, &ubi->used); + paranoid_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else { int err; @@ -1364,7 +1367,7 @@ int ubi_thread(void *u) spin_lock(&ubi->wl_lock); if (list_empty(&ubi->works) || ubi->ro_mode || - !ubi->thread_enabled || ubi_dbg_is_bgt_disabled()) { + !ubi->thread_enabled || ubi_dbg_is_bgt_disabled(ubi)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&ubi->wl_lock); schedule(); @@ -1579,7 +1582,7 @@ static int paranoid_check_ec(struct ubi_device *ubi, int pnum, int ec) long long read_ec; struct ubi_ec_hdr *ec_hdr; - if (!(ubi_chk_flags & UBI_CHK_GEN)) + if (!ubi->dbg->chk_gen) return 0; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); @@ -1609,16 +1612,18 @@ out_free: /** * paranoid_check_in_wl_tree - check that wear-leveling entry is in WL RB-tree. + * @ubi: UBI device description object * @e: the wear-leveling entry to check * @root: the root of the tree * * This function returns zero if @e is in the @root RB-tree and %-EINVAL if it * is not. */ -static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, +static int paranoid_check_in_wl_tree(const struct ubi_device *ubi, + struct ubi_wl_entry *e, struct rb_root *root) { - if (!(ubi_chk_flags & UBI_CHK_GEN)) + if (!ubi->dbg->chk_gen) return 0; if (in_wl_tree(e, root)) @@ -1638,12 +1643,13 @@ static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, * * This function returns zero if @e is in @ubi->pq and %-EINVAL if it is not. */ -static int paranoid_check_in_pq(struct ubi_device *ubi, struct ubi_wl_entry *e) +static int paranoid_check_in_pq(const struct ubi_device *ubi, + struct ubi_wl_entry *e) { struct ubi_wl_entry *p; int i; - if (!(ubi_chk_flags & UBI_CHK_GEN)) + if (!ubi->dbg->chk_gen) return 0; for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 02145e9697a9..1196f61a4ab6 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2758,6 +2758,29 @@ static void ricoh_mmc_fixup_r5c832(struct pci_dev *dev) dev_notice(&dev->dev, "proprietary Ricoh MMC controller disabled (via firewire function)\n"); dev_notice(&dev->dev, "MMC cards are now supported by standard SDHCI controller\n"); + + /* + * RICOH 0xe823 SD/MMC card reader fails to recognize + * certain types of SD/MMC cards. Lowering the SD base + * clock frequency from 200 MHz to 50 MHz fixes this issue.
+ * + * 0x150 - SD2.0 mode enable for changing base clock + * frequency to 50 MHz + * 0xe1 - Base clock frequency + * 0x32 - 50 MHz new clock frequency + * 0xf9 - Key register for 0x150 + * 0xfc - Key register for 0xe1 + */ + if (dev->device == PCI_DEVICE_ID_RICOH_R5CE823) { + pci_write_config_byte(dev, 0xf9, 0xfc); + pci_write_config_byte(dev, 0x150, 0x10); + pci_write_config_byte(dev, 0xf9, 0x00); + pci_write_config_byte(dev, 0xfc, 0x01); + pci_write_config_byte(dev, 0xe1, 0x32); + pci_write_config_byte(dev, 0xfc, 0x00); + + dev_notice(&dev->dev, "MMC controller base frequency changed to 50 MHz.\n"); + } } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index a59638b37c1a..03bc471c3eed 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -9,6 +9,23 @@ config XEN_BALLOON the system to expand the domain's memory allocation, or alternatively return unneeded memory to the system. +config XEN_SELFBALLOONING + bool "Dynamically self-balloon kernel memory to target" + depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP + default n + help + Self-ballooning dynamically balloons available kernel memory driven + by the current usage of anonymous memory ("committed AS") and + controlled by various sysfs-settable parameters. Configuring + FRONTSWAP is highly recommended; if it is not configured, self- + ballooning is disabled by default but can be enabled with the + 'selfballooning' kernel boot parameter. If FRONTSWAP is configured, + frontswap-selfshrinking is enabled by default but can be disabled + with the 'noselfshrink' kernel boot parameter; and self-ballooning + is enabled by default but can be disabled with the 'noselfballooning' + kernel boot parameter. Note that systems without a sufficiently + large swap device should not enable self-ballooning. + config XEN_SCRUB_PAGES bool "Scrub pages before returning them to system" depends on XEN_BALLOON @@ -105,4 +122,33 @@ config SWIOTLB_XEN depends on PCI select SWIOTLB +config XEN_TMEM + bool + default y if (CLEANCACHE || FRONTSWAP) + help + Shim to interface in-kernel Transcendent Memory hooks + (e.g. cleancache and frontswap) to Xen tmem hypercalls. + +config XEN_PCIDEV_BACKEND + tristate "Xen PCI-device backend driver" + depends on PCI && X86 && XEN + depends on XEN_BACKEND + default m + help + The PCI device backend driver allows the kernel to export arbitrary + PCI devices to other guests. If you select this to be a module, you + will need to make sure no other driver has bound to the device(s) + you want to make visible to other guests. + + The parameter "passthrough" allows you to specify how you want the + PCI devices to appear in the guest. You can choose the default (0), + where the PCI topology starts at 00.00.0, or (1) for passthrough, if + you want the PCI device topology to appear the same as in the host. + + The "hide" parameter (only applicable if the backend driver is + compiled into the kernel) allows you to bind the PCI devices to this + module instead of their default device drivers. The argument is the + list of PCI BDFs: xen-pciback.hide=(03:00.0)(04:00.0) + + If in doubt, say m.
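	  As an illustration (editor's sketch, not part of this patch: the
	  BDF 0000:03:00.0 is hypothetical, and the sysfs driver name is
	  commonly documented as "pciback" - verify it under
	  /sys/bus/pci/drivers on your system), a device can also be handed
	  to the module-built driver at run time:

	    echo 0000:03:00.0 > /sys/bus/pci/devices/0000:03:00.0/driver/unbind
	    echo 0000:03:00.0 > /sys/bus/pci/drivers/pciback/new_slot
	    echo 0000:03:00.0 > /sys/bus/pci/drivers/pciback/bind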
endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index bbc18258ecc5..72bbb27d7a68 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,6 +1,5 @@ obj-y += grant-table.o features.o events.o manage.o balloon.o obj-y += xenbus/ -obj-y += tmem.o nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) @@ -9,14 +8,17 @@ obj-$(CONFIG_BLOCK) += biomerge.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o +obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o +obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 30df85d8fca8..da70f5c32eb9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -615,11 +615,6 @@ static int find_irq_by_gsi(unsigned gsi) return -1; } -int xen_allocate_pirq_gsi(unsigned gsi) -{ - return gsi; -} - /* * Do not make any assumptions regarding the relationship between the * IRQ number returned here and the Xen pirq argument. @@ -1693,6 +1688,6 @@ void __init xen_init_IRQ(void) } else { irq_ctx_init(smp_processor_id()); if (xen_initial_domain()) - xen_setup_pirqs(); + pci_xen_initial_domain(); } } diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 816a44959ef0..d369965e8f8a 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -1,7 +1,7 @@ /* * Xen implementation for transcendent memory (tmem) * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Copyright (C) 2009-2011 Oracle Corp. All rights reserved. * Author: Dan Magenheimer */ @@ -9,8 +9,14 @@ #include #include #include +#include #include +/* temporary ifdef until include/linux/frontswap.h is upstream */ +#ifdef CONFIG_FRONTSWAP +#include +#endif + #include #include #include @@ -122,14 +128,8 @@ static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); } -static int xen_tmem_destroy_pool(u32 pool_id) -{ - struct tmem_oid oid = { { 0 } }; - - return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); -} - -int tmem_enabled; +int tmem_enabled __read_mostly; +EXPORT_SYMBOL(tmem_enabled); static int __init enable_tmem(char *s) { @@ -139,6 +139,14 @@ static int __init enable_tmem(char *s) __setup("tmem", enable_tmem); +#ifdef CONFIG_CLEANCACHE +static int xen_tmem_destroy_pool(u32 pool_id) +{ + struct tmem_oid oid = { { 0 } }; + + return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); +} + /* cleancache ops */ static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, @@ -240,18 +248,156 @@ static struct cleancache_ops tmem_cleancache_ops = { .init_shared_fs = tmem_cleancache_init_shared_fs, .init_fs = tmem_cleancache_init_fs }; +#endif -static int __init xen_tmem_init(void) +#ifdef CONFIG_FRONTSWAP +/* frontswap tmem operations */ + +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ +static int tmem_frontswap_poolid; + +/* + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads. 
Later, larger nr_cpus -> larger SWIZ_BITS + */ +#define SWIZ_BITS 4 +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) +#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) +#define iswiz(_ind) (_ind >> SWIZ_BITS) + +static inline struct tmem_oid oswiz(unsigned type, u32 ind) { - struct cleancache_ops old_ops; + struct tmem_oid oid = { .oid = { 0 } }; + oid.oid[0] = _oswiz(type, ind); + return oid; +} +/* returns 0 if the page was successfully put into frontswap, -1 if not */ +static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int pool = tmem_frontswap_poolid; + int ret; + + if (pool < 0) + return -1; + if (ind64 != ind) + return -1; + mb(); /* ensure page is quiescent; tmem may address it with an alias */ + ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* + * returns 0 if the page was successfully gotten from frontswap, -1 if + * was not present (should never happen!) + */ +static int tmem_frontswap_get_page(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int pool = tmem_frontswap_poolid; + int ret; + + if (pool < 0) + return -1; + if (ind64 != ind) + return -1; + ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* flush a single page from frontswap */ +static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + int pool = tmem_frontswap_poolid; + + if (pool < 0) + return; + if (ind64 != ind) + return; + (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind)); +} + +/* flush all pages from the passed swaptype */ +static void tmem_frontswap_flush_area(unsigned type) +{ + int pool = tmem_frontswap_poolid; + int ind; + + if (pool < 0) + return; + for (ind = SWIZ_MASK; ind >= 0; ind--) + (void)xen_tmem_flush_object(pool, oswiz(type, ind)); +} + +static void tmem_frontswap_init(unsigned ignored) +{ + struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID; + + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ + if (tmem_frontswap_poolid < 0) + tmem_frontswap_poolid = + xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); +} + +static int __initdata use_frontswap = 1; + +static int __init no_frontswap(char *s) +{ + use_frontswap = 0; + return 1; +} + +__setup("nofrontswap", no_frontswap); + +static struct frontswap_ops tmem_frontswap_ops = { + .put_page = tmem_frontswap_put_page, + .get_page = tmem_frontswap_get_page, + .flush_page = tmem_frontswap_flush_page, + .flush_area = tmem_frontswap_flush_area, + .init = tmem_frontswap_init +}; +#endif + +static int __init xen_tmem_init(void) +{ if (!xen_domain()) return 0; +#ifdef CONFIG_FRONTSWAP + if (tmem_enabled && use_frontswap) { + char *s = ""; + struct frontswap_ops old_ops = + frontswap_register_ops(&tmem_frontswap_ops); + + tmem_frontswap_poolid = -1; + if (old_ops.init != NULL) + s = " (WARNING: frontswap_ops overridden)"; + printk(KERN_INFO "frontswap enabled, RAM provided by " + "Xen Transcendent Memory\n"); + } +#endif #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); if (tmem_enabled && 
use_cleancache) { char *s = ""; - old_ops = cleancache_register_ops(&tmem_cleancache_ops); + struct cleancache_ops old_ops = + cleancache_register_ops(&tmem_cleancache_ops); if (old_ops.init_fs != NULL) s = " (WARNING: cleancache_ops overridden)"; printk(KERN_INFO "cleancache enabled, RAM provided by " diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index a4ff225ee868..5c9dc43c1e94 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -98,6 +98,8 @@ static int __init balloon_init(void) register_balloon(&balloon_sysdev); + register_xen_selfballooning(&balloon_sysdev); + target_watch.callback = watch_target; xenstore_notifier.notifier_call = balloon_init_watcher; diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile new file mode 100644 index 000000000000..ffe0ad3438bd --- /dev/null +++ b/drivers/xen/xen-pciback/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o + +xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o +xen-pciback-y += conf_space.o conf_space_header.o \ + conf_space_capability.o \ + conf_space_quirks.o vpci.o \ + passthrough.o diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c new file mode 100644 index 000000000000..a8031445d94e --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.c @@ -0,0 +1,438 @@ +/* + * PCI Backend - Functions for creating a virtual configuration space for + * exported PCI Devices. + * It's dangerous to allow PCI Driver Domains to change their + * device's resources (memory, i/o ports, interrupts). We need to + * restrict changes to certain PCI Configuration registers: + * BARs, INTERRUPT_PIN, most registers in the header... + * + * Author: Ryan Wilson + */ + +#include +#include +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +#define DRV_NAME "xen-pciback" +static int permissive; +module_param(permissive, bool, 0644); + +/* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word, + * xen_pcibk_write_config_word, and xen_pcibk_write_config_byte are created. 
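+ * (Editor's note: each expansion below is a thin wrapper that forwards
+ * to the matching pci_{read,write}_config_{byte,word,dword} accessor
+ * from the PCI core, ignoring the per-field data pointer.)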
*/ +#define DEFINE_PCI_CONFIG(op, size, type) \ +int xen_pcibk_##op##_config_##size \ +(struct pci_dev *dev, int offset, type value, void *data) \ +{ \ + return pci_##op##_config_##size(dev, offset, value); \ +} + +DEFINE_PCI_CONFIG(read, byte, u8 *) +DEFINE_PCI_CONFIG(read, word, u16 *) +DEFINE_PCI_CONFIG(read, dword, u32 *) + +DEFINE_PCI_CONFIG(write, byte, u8) +DEFINE_PCI_CONFIG(write, word, u16) +DEFINE_PCI_CONFIG(write, dword, u32) + +static int conf_space_read(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 *value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + *value = 0; + + switch (field->size) { + case 1: + if (field->u.b.read) + ret = field->u.b.read(dev, offset, (u8 *) value, + entry->data); + break; + case 2: + if (field->u.w.read) + ret = field->u.w.read(dev, offset, (u16 *) value, + entry->data); + break; + case 4: + if (field->u.dw.read) + ret = field->u.dw.read(dev, offset, value, entry->data); + break; + } + return ret; +} + +static int conf_space_write(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + switch (field->size) { + case 1: + if (field->u.b.write) + ret = field->u.b.write(dev, offset, (u8) value, + entry->data); + break; + case 2: + if (field->u.w.write) + ret = field->u.w.write(dev, offset, (u16) value, + entry->data); + break; + case 4: + if (field->u.dw.write) + ret = field->u.dw.write(dev, offset, value, + entry->data); + break; + } + return ret; +} + +static inline u32 get_mask(int size) +{ + if (size == 1) + return 0xff; + else if (size == 2) + return 0xffff; + else + return 0xffffffff; +} + +static inline int valid_request(int offset, int size) +{ + /* Validate request (no un-aligned requests) */ + if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0) + return 1; + return 0; +} + +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, + int offset) +{ + if (offset >= 0) { + new_val_mask <<= (offset * 8); + new_val <<= (offset * 8); + } else { + new_val_mask >>= (offset * -8); + new_val >>= (offset * -8); + } + val = (val & ~new_val_mask) | (new_val & new_val_mask); + + return val; +} + +static int pcibios_err_to_errno(int err) +{ + switch (err) { + case PCIBIOS_SUCCESSFUL: + return XEN_PCI_ERR_success; + case PCIBIOS_DEVICE_NOT_FOUND: + return XEN_PCI_ERR_dev_not_found; + case PCIBIOS_BAD_REGISTER_NUMBER: + return XEN_PCI_ERR_invalid_offset; + case PCIBIOS_FUNC_NOT_SUPPORTED: + return XEN_PCI_ERR_not_implemented; + case PCIBIOS_SET_FAILED: + return XEN_PCI_ERR_access_denied; + } + return err; +} + +int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, + u32 *ret_val) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + int req_start, req_end, field_start, field_end; + /* if read fails for any reason, return 0 + * (as if device didn't respond) */ + u32 value = 0, tmp_val; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n", + pci_name(dev), size, offset); + + if (!valid_request(offset, size)) { + err = XEN_PCI_ERR_invalid_offset; + goto out; + } + + /* Get the real value first, then modify as appropriate */ + switch (size) { + case 1: + err = pci_read_config_byte(dev, offset, (u8 *) &value); + break; + case 2: + err = pci_read_config_word(dev, offset, (u16 *) &value); + break; + case 4: + err = 
pci_read_config_dword(dev, offset, &value); + break; + } + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + req_start = offset; + req_end = offset + size; + field_start = OFFSET(cfg_entry); + field_end = OFFSET(cfg_entry) + field->size; + + if ((req_start >= field_start && req_start < field_end) + || (req_end > field_start && req_end <= field_end)) { + err = conf_space_read(dev, cfg_entry, field_start, + &tmp_val); + if (err) + goto out; + + value = merge_value(value, tmp_val, + get_mask(field->size), + field_start - req_start); + } + } + +out: + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n", + pci_name(dev), size, offset, value); + + *ret_val = value; + return pcibios_err_to_errno(err); +} + +int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) +{ + int err = 0, handled = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + u32 tmp_val; + int req_start, req_end, field_start, field_end; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG + DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n", + pci_name(dev), size, offset, value); + + if (!valid_request(offset, size)) + return XEN_PCI_ERR_invalid_offset; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + req_start = offset; + req_end = offset + size; + field_start = OFFSET(cfg_entry); + field_end = OFFSET(cfg_entry) + field->size; + + if ((req_start >= field_start && req_start < field_end) + || (req_end > field_start && req_end <= field_end)) { + tmp_val = 0; + + err = xen_pcibk_config_read(dev, field_start, + field->size, &tmp_val); + if (err) + break; + + tmp_val = merge_value(tmp_val, value, get_mask(size), + req_start - field_start); + + err = conf_space_write(dev, cfg_entry, field_start, + tmp_val); + + /* handled is set true here, but not every byte + * may have been written! Properly detecting if + * every byte is handled is unnecessary as the + * flag is used to detect devices that need + * special helpers to work correctly. + */ + handled = 1; + } + } + + if (!handled && !err) { + /* By default, anything not specifically handled above is + * read-only. The permissive flag changes this behavior so + * that anything not specifically handled above is writable. + * This means that some fields may still be read-only because + * they have entries in the config_field list that intercept + * the write and do nothing. */ + if (dev_data->permissive || permissive) { + switch (size) { + case 1: + err = pci_write_config_byte(dev, offset, + (u8) value); + break; + case 2: + err = pci_write_config_word(dev, offset, + (u16) value); + break; + case 4: + err = pci_write_config_dword(dev, offset, + (u32) value); + break; + } + } else if (!dev_data->warned_on_write) { + dev_data->warned_on_write = 1; + dev_warn(&dev->dev, "Driver tried to write to a " + "read-only configuration space field at offset" + " 0x%x, size %d. 
This may be harmless, but if " + "you have problems with your device:\n" + "1) see permissive attribute in sysfs\n" + "2) report problems to the xen-devel " + "mailing list along with details of your " + "device obtained from lspci.\n", offset, size); + } + } + + return pcibios_err_to_errno(err); +} + +void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry, *t; + const struct config_field *field; + + dev_dbg(&dev->dev, "freeing dynamically allocated virtual " + "configuration space fields\n"); + if (!dev_data) + return; + + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { + field = cfg_entry->field; + + if (field->clean) { + field->clean((struct config_field *)field); + + kfree(cfg_entry->data); + + list_del(&cfg_entry->list); + kfree(cfg_entry); + } + + } +} + +void xen_pcibk_config_reset_dev(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + + dev_dbg(&dev->dev, "resetting virtual configuration space\n"); + if (!dev_data) + return; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + if (field->reset) + field->reset(dev, OFFSET(cfg_entry), cfg_entry->data); + } +} + +void xen_pcibk_config_free_dev(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry, *t; + const struct config_field *field; + + dev_dbg(&dev->dev, "freeing virtual configuration space fields\n"); + if (!dev_data) + return; + + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { + list_del(&cfg_entry->list); + + field = cfg_entry->field; + + if (field->release) + field->release(dev, OFFSET(cfg_entry), cfg_entry->data); + + kfree(cfg_entry); + } +} + +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int base_offset) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry; + void *tmp; + + cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL); + if (!cfg_entry) { + err = -ENOMEM; + goto out; + } + + cfg_entry->data = NULL; + cfg_entry->field = field; + cfg_entry->base_offset = base_offset; + + /* silently ignore duplicate fields */ + err = xen_pcibk_field_is_dup(dev, OFFSET(cfg_entry)); + if (err) + goto out; + + if (field->init) { + tmp = field->init(dev, OFFSET(cfg_entry)); + + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto out; + } + + cfg_entry->data = tmp; + } + + dev_dbg(&dev->dev, "added config field at offset 0x%02x\n", + OFFSET(cfg_entry)); + list_add_tail(&cfg_entry->list, &dev_data->config_fields); + +out: + if (err) + kfree(cfg_entry); + + return err; +} + +/* This sets up the device's virtual configuration space to keep track of + * certain registers (like the base address registers (BARs)) so that we can + * keep the client from manipulating them directly.
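+ * (Editor's note: the registration order below is standard header
+ * fields first, then capability-list fields, then device-specific
+ * quirks.)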
+ */ +int xen_pcibk_config_init_dev(struct pci_dev *dev) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + dev_dbg(&dev->dev, "initializing virtual configuration space\n"); + + INIT_LIST_HEAD(&dev_data->config_fields); + + err = xen_pcibk_config_header_add_fields(dev); + if (err) + goto out; + + err = xen_pcibk_config_capability_add_fields(dev); + if (err) + goto out; + + err = xen_pcibk_config_quirks_init(dev); + +out: + return err; +} + +int xen_pcibk_config_init(void) +{ + return xen_pcibk_config_capability_init(); +} diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h new file mode 100644 index 000000000000..e56c934ad137 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.h @@ -0,0 +1,126 @@ +/* + * PCI Backend - Common data structures for overriding the configuration space + * + * Author: Ryan Wilson + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_H__ +#define __XEN_PCIBACK_CONF_SPACE_H__ + +#include +#include + +/* conf_field_init can return an errno in a ptr with ERR_PTR() */ +typedef void *(*conf_field_init) (struct pci_dev *dev, int offset); +typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data); +typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data); + +typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value, + void *data); +typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value, + void *data); +typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value, + void *data); +typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value, + void *data); +typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value, + void *data); +typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value, + void *data); + +/* These are the fields within the configuration space which we + * are interested in intercepting reads/writes to and changing their + * values. 
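+ * (Editor's note: 'size' selects which arm of the union is consulted -
+ * 1 byte -> u.b, 2 bytes -> u.w, 4 bytes -> u.dw - in conf_space_read()
+ * and conf_space_write().)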
+ */ +struct config_field { + unsigned int offset; + unsigned int size; + unsigned int mask; + conf_field_init init; + conf_field_reset reset; + conf_field_free release; + void (*clean) (struct config_field *field); + union { + struct { + conf_dword_write write; + conf_dword_read read; + } dw; + struct { + conf_word_write write; + conf_word_read read; + } w; + struct { + conf_byte_write write; + conf_byte_read read; + } b; + } u; + struct list_head list; +}; + +struct config_field_entry { + struct list_head list; + const struct config_field *field; + unsigned int base_offset; + void *data; +}; + +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) + +/* Add fields to a device - the add_fields macro expects to get a pointer to + * the first entry in an array (of which the ending is marked by size==0) + */ +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset); + +static inline int xen_pcibk_config_add_field(struct pci_dev *dev, + const struct config_field *field) +{ + return xen_pcibk_config_add_field_offset(dev, field, 0); +} + +static inline int xen_pcibk_config_add_fields(struct pci_dev *dev, + const struct config_field *field) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = xen_pcibk_config_add_field(dev, &field[i]); + if (err) + break; + } + return err; +} + +static inline int xen_pcibk_config_add_fields_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = xen_pcibk_config_add_field_offset(dev, &field[i], offset); + if (err) + break; + } + return err; +} + +/* Read/Write the real configuration space */ +int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset, u8 *value, + void *data); +int xen_pcibk_read_config_word(struct pci_dev *dev, int offset, u16 *value, + void *data); +int xen_pcibk_read_config_dword(struct pci_dev *dev, int offset, u32 *value, + void *data); +int xen_pcibk_write_config_byte(struct pci_dev *dev, int offset, u8 value, + void *data); +int xen_pcibk_write_config_word(struct pci_dev *dev, int offset, u16 value, + void *data); +int xen_pcibk_write_config_dword(struct pci_dev *dev, int offset, u32 value, + void *data); + +int xen_pcibk_config_capability_init(void); + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev); +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev); + +#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c new file mode 100644 index 000000000000..7f83e9083e9d --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -0,0 +1,207 @@ +/* + * PCI Backend - Handles the virtual fields found on the capability lists + * in the configuration space. 
+ * + * Author: Ryan Wilson + */ + +#include +#include +#include "pciback.h" +#include "conf_space.h" + +static LIST_HEAD(capabilities); +struct xen_pcibk_config_capability { + struct list_head cap_list; + + int capability; + + /* If the device has the capability found above, add these fields */ + const struct config_field *fields; +}; + +static const struct config_field caplist_header[] = { + { + .offset = PCI_CAP_LIST_ID, + .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */ + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = NULL, + }, + {} +}; + +static inline void register_capability(struct xen_pcibk_config_capability *cap) +{ + list_add_tail(&cap->cap_list, &capabilities); +} + +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev) +{ + int err = 0; + struct xen_pcibk_config_capability *cap; + int cap_offset; + + list_for_each_entry(cap, &capabilities, cap_list) { + cap_offset = pci_find_capability(dev, cap->capability); + if (cap_offset) { + dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n", + cap->capability, cap_offset); + + err = xen_pcibk_config_add_fields_offset(dev, + caplist_header, + cap_offset); + if (err) + goto out; + err = xen_pcibk_config_add_fields_offset(dev, + cap->fields, + cap_offset); + if (err) + goto out; + } + } + +out: + return err; +} + +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value, + void *data) +{ + /* Disallow writes to the vital product data */ + if (value & PCI_VPD_ADDR_F) + return PCIBIOS_SET_FAILED; + else + return pci_write_config_word(dev, offset, value); +} + +static const struct config_field caplist_vpd[] = { + { + .offset = PCI_VPD_ADDR, + .size = 2, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = vpd_address_write, + }, + { + .offset = PCI_VPD_DATA, + .size = 4, + .u.dw.read = xen_pcibk_read_config_dword, + .u.dw.write = NULL, + }, + {} +}; + +static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, + void *data) +{ + int err; + u16 real_value; + + err = pci_read_config_word(dev, offset, &real_value); + if (err) + goto out; + + *value = real_value & ~PCI_PM_CAP_PME_MASK; + +out: + return err; +} + +/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. 
+ * Can't allow driver domain to enable PMEs - they're shared */ +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) + +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, + void *data) +{ + int err; + u16 old_value; + pci_power_t new_state, old_state; + + err = pci_read_config_word(dev, offset, &old_value); + if (err) + goto out; + + old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); + new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); + + new_value &= PM_OK_BITS; + if ((old_value & PM_OK_BITS) != new_value) { + new_value = (old_value & ~PM_OK_BITS) | new_value; + err = pci_write_config_word(dev, offset, new_value); + if (err) + goto out; + } + + /* Let pci core handle the power management change */ + dev_dbg(&dev->dev, "set power state to %x\n", new_state); + err = pci_set_power_state(dev, new_state); + if (err) { + err = PCIBIOS_SET_FAILED; + goto out; + } + + out: + return err; +} + +/* Ensure PMEs are disabled */ +static void *pm_ctrl_init(struct pci_dev *dev, int offset) +{ + int err; + u16 value; + + err = pci_read_config_word(dev, offset, &value); + if (err) + goto out; + + if (value & PCI_PM_CTRL_PME_ENABLE) { + value &= ~PCI_PM_CTRL_PME_ENABLE; + err = pci_write_config_word(dev, offset, value); + } + +out: + return ERR_PTR(err); +} + +static const struct config_field caplist_pm[] = { + { + .offset = PCI_PM_PMC, + .size = 2, + .u.w.read = pm_caps_read, + }, + { + .offset = PCI_PM_CTRL, + .size = 2, + .init = pm_ctrl_init, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = pm_ctrl_write, + }, + { + .offset = PCI_PM_PPB_EXTENSIONS, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + .offset = PCI_PM_DATA_REGISTER, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + {} +}; + +static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = { + .capability = PCI_CAP_ID_PM, + .fields = caplist_pm, +}; +static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = { + .capability = PCI_CAP_ID_VPD, + .fields = caplist_vpd, +}; + +int xen_pcibk_config_capability_init(void) +{ + register_capability(&xen_pcibk_config_capability_vpd); + register_capability(&xen_pcibk_config_capability_pm); + + return 0; +} diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c new file mode 100644 index 000000000000..da3cbdfcb5dc --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -0,0 +1,386 @@ +/* + * PCI Backend - Handles the virtual fields in the configuration space headers. 
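+ *
+ * The overlay below virtualizes the standard header: vendor/device IDs are
+ * answered from the cached struct pci_dev, the command register is
+ * intercepted so the backend can really enable/disable the device, and the
+ * BARs answer sizing probes from saved copies instead of touching hardware.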
+ * + * Author: Ryan Wilson + */ + +#include +#include +#include "pciback.h" +#include "conf_space.h" + +struct pci_bar_info { + u32 val; + u32 len_val; + int which; +}; + +#define DRV_NAME "xen-pciback" +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) + +static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) +{ + int i; + int ret; + + ret = xen_pcibk_read_config_word(dev, offset, value, data); + if (!atomic_read(&dev->enable_cnt)) + return ret; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + if (dev->resource[i].flags & IORESOURCE_IO) + *value |= PCI_COMMAND_IO; + if (dev->resource[i].flags & IORESOURCE_MEM) + *value |= PCI_COMMAND_MEMORY; + } + + return ret; +} + +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) +{ + struct xen_pcibk_dev_data *dev_data; + int err; + + dev_data = pci_get_drvdata(dev); + if (!pci_is_enabled(dev) && is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable\n", + pci_name(dev)); + err = pci_enable_device(dev); + if (err) + return err; + if (dev_data) + dev_data->enable_intx = 1; + } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: disable\n", + pci_name(dev)); + pci_disable_device(dev); + if (dev_data) + dev_data->enable_intx = 0; + } + + if (!dev->is_busmaster && is_master_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n", + pci_name(dev)); + pci_set_master(dev); + } + + if (value & PCI_COMMAND_INVALIDATE) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG + DRV_NAME ": %s: enable memory-write-invalidate\n", + pci_name(dev)); + err = pci_set_mwi(dev); + if (err) { + printk(KERN_WARNING + DRV_NAME ": %s: cannot enable " + "memory-write-invalidate (%d)\n", + pci_name(dev), err); + value &= ~PCI_COMMAND_INVALIDATE; + } + } + + return pci_write_config_word(dev, offset, value); +} + +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. + * This does not (yet) support writing individual bytes + */ + if (value == ~PCI_ROM_ADDRESS_ENABLE) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. */ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + /* Do we need to support enabling/disabling the rom address here? */ + + return 0; +} + +/* For the BARs, only allow writes which write ~0 or + * the correct resource information + * (Needed for when the driver probes the resource usage) + */ +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. 
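+ * Worked example (hypothetical 1 MiB memory BAR): after the guest writes
+ * 0xffffffff, bar->which is set and reads return bar->len_val (0x00100000,
+ * the region size) instead of bar->val; any other write clears bar->which
+ * again, and writing the original bar->val back also restores the real
+ * register.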
+ * This does not (yet) support writing individual bytes + */ + if (value == ~0) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. */ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + return 0; +} + +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + *value = bar->which ? bar->len_val : bar->val; + + return 0; +} + +static inline void read_dev_bar(struct pci_dev *dev, + struct pci_bar_info *bar_info, int offset, + u32 len_mask) +{ + int pos; + struct resource *res = dev->resource; + + if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) + pos = PCI_ROM_RESOURCE; + else { + pos = (offset - PCI_BASE_ADDRESS_0) / 4; + if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE | + PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == + (PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64))) { + bar_info->val = res[pos - 1].start >> 32; + bar_info->len_val = res[pos - 1].end >> 32; + return; + } + } + + bar_info->val = res[pos].start | + (res[pos].flags & PCI_REGION_FLAG_MASK); + bar_info->len_val = res[pos].end - res[pos].start + 1; +} + +static void *bar_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~0); + bar->which = 0; + + return bar; +} + +static void *rom_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); + bar->which = 0; + + return bar; +} + +static void bar_reset(struct pci_dev *dev, int offset, void *data) +{ + struct pci_bar_info *bar = data; + + bar->which = 0; +} + +static void bar_release(struct pci_dev *dev, int offset, void *data) +{ + kfree(data); +} + +static int xen_pcibk_read_vendor(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->vendor; + + return 0; +} + +static int xen_pcibk_read_device(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->device; + + return 0; +} + +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value, + void *data) +{ + *value = (u8) dev->irq; + + return 0; +} + +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data) +{ + u8 cur_value; + int err; + + err = pci_read_config_byte(dev, offset, &cur_value); + if (err) + goto out; + + if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START) + || value == PCI_BIST_START) + err = pci_write_config_byte(dev, offset, value); + +out: + return err; +} + +static const struct config_field header_common[] = { + { + .offset = PCI_VENDOR_ID, + .size = 2, + .u.w.read = xen_pcibk_read_vendor, + }, + { + .offset = PCI_DEVICE_ID, + .size = 2, + .u.w.read = xen_pcibk_read_device, + }, + { + .offset = PCI_COMMAND, + .size = 2, + .u.w.read = command_read, + .u.w.write = command_write, + }, + { + .offset = PCI_INTERRUPT_LINE, + .size = 1, + .u.b.read = interrupt_read, + }, + { + .offset = PCI_INTERRUPT_PIN, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + /* Any side effects of letting driver domain control cache line? 
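+ * (Probably benign: the cache line size mainly tunes memory-write-
+ * invalidate behaviour, so writes are passed through for now.)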
*/ + .offset = PCI_CACHE_LINE_SIZE, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + .u.b.write = xen_pcibk_write_config_byte, + }, + { + .offset = PCI_LATENCY_TIMER, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + .offset = PCI_BIST, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + .u.b.write = bist_write, + }, + {} +}; + +#define CFG_FIELD_BAR(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = bar_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = bar_write, \ + } + +#define CFG_FIELD_ROM(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = rom_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = rom_write, \ + } + +static const struct config_field header_0[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_2), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_3), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_4), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_5), + CFG_FIELD_ROM(PCI_ROM_ADDRESS), + {} +}; + +static const struct config_field header_1[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_ROM(PCI_ROM_ADDRESS1), + {} +}; + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev) +{ + int err; + + err = xen_pcibk_config_add_fields(dev, header_common); + if (err) + goto out; + + switch (dev->hdr_type) { + case PCI_HEADER_TYPE_NORMAL: + err = xen_pcibk_config_add_fields(dev, header_0); + break; + + case PCI_HEADER_TYPE_BRIDGE: + err = xen_pcibk_config_add_fields(dev, header_1); + break; + + default: + err = -EINVAL; + printk(KERN_ERR DRV_NAME ": %s: Unsupported header type %d!\n", + pci_name(dev), dev->hdr_type); + break; + } + +out: + return err; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c new file mode 100644 index 000000000000..921a889e65eb --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -0,0 +1,140 @@ +/* + * PCI Backend - Handle special overlays for broken devices. 
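+ *
+ * A quirk whitelists extra config-space registers for one specific device
+ * (matched on vendor/device/subsystem IDs) beyond what the generic
+ * overlays expose; fields added this way get the default read/write
+ * handlers for their size.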
+ * + * Author: Ryan Wilson + * Author: Chris Bookholt + */ + +#include +#include +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +LIST_HEAD(xen_pcibk_quirks); +#define DRV_NAME "xen-pciback" +static inline const struct pci_device_id * +match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) +{ + if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && + (id->device == PCI_ANY_ID || id->device == dev->device) && + (id->subvendor == PCI_ANY_ID || + id->subvendor == dev->subsystem_vendor) && + (id->subdevice == PCI_ANY_ID || + id->subdevice == dev->subsystem_device) && + !((id->class ^ dev->class) & id->class_mask)) + return id; + return NULL; +} + +static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *tmp_quirk; + + list_for_each_entry(tmp_quirk, &xen_pcibk_quirks, quirks_list) + if (match_one_device(&tmp_quirk->devid, dev) != NULL) + goto out; + tmp_quirk = NULL; + printk(KERN_DEBUG DRV_NAME + ":quirk didn't match any device xen_pciback knows about\n"); +out: + return tmp_quirk; +} + +static inline void register_quirk(struct xen_pcibk_config_quirk *quirk) +{ + list_add_tail(&quirk->quirks_list, &xen_pcibk_quirks); +} + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg) +{ + int ret = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + if (OFFSET(cfg_entry) == reg) { + ret = 1; + break; + } + } + return ret; +} + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field + *field) +{ + int err = 0; + + switch (field->size) { + case 1: + field->u.b.read = xen_pcibk_read_config_byte; + field->u.b.write = xen_pcibk_write_config_byte; + break; + case 2: + field->u.w.read = xen_pcibk_read_config_word; + field->u.w.write = xen_pcibk_write_config_word; + break; + case 4: + field->u.dw.read = xen_pcibk_read_config_dword; + field->u.dw.write = xen_pcibk_write_config_dword; + break; + default: + err = -EINVAL; + goto out; + } + + xen_pcibk_config_add_field(dev, field); + +out: + return err; +} + +int xen_pcibk_config_quirks_init(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *quirk; + int ret = 0; + + quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC); + if (!quirk) { + ret = -ENOMEM; + goto out; + } + + quirk->devid.vendor = dev->vendor; + quirk->devid.device = dev->device; + quirk->devid.subvendor = dev->subsystem_vendor; + quirk->devid.subdevice = dev->subsystem_device; + quirk->devid.class = 0; + quirk->devid.class_mask = 0; + quirk->devid.driver_data = 0UL; + + quirk->pdev = dev; + + register_quirk(quirk); +out: + return ret; +} + +void xen_pcibk_config_field_free(struct config_field *field) +{ + kfree(field); +} + +int xen_pcibk_config_quirk_release(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *quirk; + int ret = 0; + + quirk = xen_pcibk_find_quirk(dev); + if (!quirk) { + ret = -ENXIO; + goto out; + } + + list_del(&quirk->quirks_list); + kfree(quirk); + +out: + return ret; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h new file mode 100644 index 000000000000..cfcc517e4570 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -0,0 +1,33 @@ +/* + * PCI Backend - Data structures for special overlays for broken devices. 
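+ *
+ * (Declarations only; the matching and registration logic lives in
+ * conf_space_quirks.c.)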
+ * + * Ryan Wilson + * Chris Bookholt + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ +#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ + +#include +#include + +struct xen_pcibk_config_quirk { + struct list_head quirks_list; + struct pci_device_id devid; + struct pci_dev *pdev; +}; + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field + *field); + +int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg); + +int xen_pcibk_config_quirks_init(struct pci_dev *dev); + +void xen_pcibk_config_field_free(struct config_field *field); + +int xen_pcibk_config_quirk_release(struct pci_dev *dev); + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg); + +#endif diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c new file mode 100644 index 000000000000..1d32a9a42c01 --- /dev/null +++ b/drivers/xen/xen-pciback/passthrough.c @@ -0,0 +1,194 @@ +/* + * PCI Backend - Provides restricted access to the real PCI bus topology + * to the frontend + * + * Author: Ryan Wilson + */ + +#include +#include +#include +#include "pciback.h" + +struct passthrough_dev_data { + /* Access to dev_list must be protected by lock */ + struct list_head dev_list; + spinlock_t lock; +}; + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, + unsigned int bus, + unsigned int devfn) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + struct pci_dev *dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_data->lock, flags); + + list_for_each_entry(dev_entry, &dev_data->dev_list, list) { + if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) + && bus == (unsigned int)dev_entry->dev->bus->number + && devfn == dev_entry->dev->devfn) { + dev = dev_entry->dev; + break; + } + } + + spin_unlock_irqrestore(&dev_data->lock, flags); + + return dev; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + unsigned long flags; + unsigned int domain, bus, devfn; + int err; + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); + if (!dev_entry) + return -ENOMEM; + dev_entry->dev = dev; + + spin_lock_irqsave(&dev_data->lock, flags); + list_add_tail(&dev_entry->list, &dev_data->dev_list); + spin_unlock_irqrestore(&dev_data->lock, flags); + + /* Publish this device. 
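+ * For the passthrough backend the real domain/bus/devfn are handed to
+ * publish_cb unchanged, so the frontend sees the host's own BDF.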
*/ + domain = (unsigned int)pci_domain_nr(dev->bus); + bus = (unsigned int)dev->bus->number; + devfn = dev->devfn; + err = publish_cb(pdev, domain, bus, devfn, devid); + + return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_data->lock, flags); + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + if (dev_entry->dev == dev) { + list_del(&dev_entry->list); + found_dev = dev_entry->dev; + kfree(dev_entry); + } + } + + spin_unlock_irqrestore(&dev_data->lock, flags); + + if (found_dev) + pcistub_put_pci_dev(found_dev); +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + struct passthrough_dev_data *dev_data; + + dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + spin_lock_init(&dev_data->lock); + + INIT_LIST_HEAD(&dev_data->dev_list); + + pdev->pci_dev_data = dev_data; + + return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb publish_root_cb) +{ + int err = 0; + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *e, *tmp; + struct pci_dev *dev; + int found; + unsigned int domain, bus; + + spin_lock(&dev_data->lock); + + list_for_each_entry_safe(dev_entry, tmp, &dev_data->dev_list, list) { + /* Only publish this device as a root if none of its + * parent bridges are exported + */ + found = 0; + dev = dev_entry->dev->bus->self; + for (; !found && dev != NULL; dev = dev->bus->self) { + list_for_each_entry(e, &dev_data->dev_list, list) { + if (dev == e->dev) { + found = 1; + break; + } + } + } + + domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus); + bus = (unsigned int)dev_entry->dev->bus->number; + + if (!found) { + spin_unlock(&dev_data->lock); + err = publish_root_cb(pdev, domain, bus); + if (err) + break; + spin_lock(&dev_data->lock); + } + } + + if (!err) + spin_unlock(&dev_data->lock); + + return err; +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + list_del(&dev_entry->list); + pcistub_put_pci_dev(dev_entry->dev); + kfree(dev_entry); + } + + kfree(dev_data); + pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn) +{ + *domain = pci_domain_nr(pcidev->bus); + *bus = pcidev->bus->number; + *devfn = pcidev->devfn; + return 1; +} + +struct xen_pcibk_backend xen_pcibk_passthrough_backend = { + .name = "passthrough", + .init = __xen_pcibk_init_devices, + .free = __xen_pcibk_release_devices, + .find = __xen_pcibk_get_pcifront_dev, + .publish = __xen_pcibk_publish_pci_roots, + .release = __xen_pcibk_release_pci_dev, + .add = __xen_pcibk_add_pci_dev, + .get = __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c new file mode 100644 index 000000000000..aec214ac0a14 --- /dev/null +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -0,0 +1,1376 @@ +/* + * PCI Stub Driver - Grabs devices in backend to be exported later + * + * Ryan Wilson + * Chris Bookholt + */ +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +#define DRV_NAME "xen-pciback" + +static char *pci_devs_to_hide; +wait_queue_head_t xen_pcibk_aer_wait_queue; +/*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops, +* We want to avoid in middle of AER ops, xen_pcibk devices is being removed +*/ +static DECLARE_RWSEM(pcistub_sem); +module_param_named(hide, pci_devs_to_hide, charp, 0444); + +struct pcistub_device_id { + struct list_head slot_list; + int domain; + unsigned char bus; + unsigned int devfn; +}; +static LIST_HEAD(pcistub_device_ids); +static DEFINE_SPINLOCK(device_ids_lock); + +struct pcistub_device { + struct kref kref; + struct list_head dev_list; + spinlock_t lock; + + struct pci_dev *dev; + struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +}; + +/* Access to pcistub_devices & seized_devices lists and the initialize_devices + * flag must be locked with pcistub_devices_lock + */ +static DEFINE_SPINLOCK(pcistub_devices_lock); +static LIST_HEAD(pcistub_devices); + +/* wait for device_initcall before initializing our devices + * (see pcistub_init_devices_late) + */ +static int initialize_devices; +static LIST_HEAD(seized_devices); + +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + + dev_dbg(&dev->dev, "pcistub_device_alloc\n"); + + psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC); + if (!psdev) + return NULL; + + psdev->dev = pci_dev_get(dev); + if (!psdev->dev) { + kfree(psdev); + return NULL; + } + + kref_init(&psdev->kref); + spin_lock_init(&psdev->lock); + + return psdev; +} + +/* Don't call this directly as it's called by pcistub_device_put */ +static void pcistub_device_release(struct kref *kref) +{ + struct pcistub_device *psdev; + + psdev = container_of(kref, struct pcistub_device, kref); + + dev_dbg(&psdev->dev->dev, "pcistub_device_release\n"); + + xen_unregister_device_domain_owner(psdev->dev); + + /* Clean-up the device */ + xen_pcibk_reset_device(psdev->dev); + xen_pcibk_config_free_dyn_fields(psdev->dev); + xen_pcibk_config_free_dev(psdev->dev); + kfree(pci_get_drvdata(psdev->dev)); + pci_set_drvdata(psdev->dev, NULL); + + pci_dev_put(psdev->dev); + + kfree(psdev); +} + +static inline void pcistub_device_get(struct pcistub_device *psdev) +{ + kref_get(&psdev->kref); +} + +static inline void pcistub_device_put(struct pcistub_device *psdev) +{ + kref_put(&psdev->kref, pcistub_device_release); +} + +static struct pcistub_device *pcistub_device_find(int domain, int bus, + int slot, int func) +{ + struct pcistub_device *psdev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev != NULL + && domain == pci_domain_nr(psdev->dev->bus) + && bus == psdev->dev->bus->number + && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + pcistub_device_get(psdev); + goto out; + } + } + + /* didn't find it */ + psdev = NULL; + +out: + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return psdev; +} + +static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, + struct pcistub_device *psdev) +{ + struct pci_dev *pci_dev = NULL; + unsigned long flags; + + pcistub_device_get(psdev); + + spin_lock_irqsave(&psdev->lock, flags); + if (!psdev->pdev) { + psdev->pdev = pdev; + pci_dev = psdev->dev; + } + 
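+ /* If the slot is already bound to another xen_pcibk_device, pci_dev
+ * stays NULL and the extra reference taken above is dropped below. */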
spin_unlock_irqrestore(&psdev->lock, flags); + + if (!pci_dev) + pcistub_device_put(psdev); + + return pci_dev; +} + +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, + int domain, int bus, + int slot, int func) +{ + struct pcistub_device *psdev; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev != NULL + && domain == pci_domain_nr(psdev->dev->bus) + && bus == psdev->dev->bus->number + && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + found_dev = pcistub_device_get_pci_dev(pdev, psdev); + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return found_dev; +} + +struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + struct pcistub_device *psdev; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_dev = pcistub_device_get_pci_dev(pdev, psdev); + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return found_dev; +} + +void pcistub_put_pci_dev(struct pci_dev *dev) +{ + struct pcistub_device *psdev, *found_psdev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_psdev = psdev; + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /*hold this lock for avoiding breaking link between + * pcistub and xen_pcibk when AER is in processing + */ + down_write(&pcistub_sem); + /* Cleanup our device + * (so it's ready for the next domain) + */ + xen_pcibk_reset_device(found_psdev->dev); + xen_pcibk_config_free_dyn_fields(found_psdev->dev); + xen_pcibk_config_reset_dev(found_psdev->dev); + + spin_lock_irqsave(&found_psdev->lock, flags); + found_psdev->pdev = NULL; + spin_unlock_irqrestore(&found_psdev->lock, flags); + + pcistub_device_put(found_psdev); + up_write(&pcistub_sem); +} + +static int __devinit pcistub_match_one(struct pci_dev *dev, + struct pcistub_device_id *pdev_id) +{ + /* Match the specified device by domain, bus, slot, func and also if + * any of the device's parent bridges match. + */ + for (; dev != NULL; dev = dev->bus->self) { + if (pci_domain_nr(dev->bus) == pdev_id->domain + && dev->bus->number == pdev_id->bus + && dev->devfn == pdev_id->devfn) + return 1; + + /* Sometimes topmost bridge links to itself. */ + if (dev == dev->bus->self) + break; + } + + return 0; +} + +static int __devinit pcistub_match(struct pci_dev *dev) +{ + struct pcistub_device_id *pdev_id; + unsigned long flags; + int found = 0; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) { + if (pcistub_match_one(dev, pdev_id)) { + found = 1; + break; + } + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return found; +} + +static int __devinit pcistub_init_device(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data; + int err = 0; + + dev_dbg(&dev->dev, "initializing...\n"); + + /* The PCI backend is not intended to be a module (or to work with + * removable PCI devices (yet). If it were, xen_pcibk_config_free() + * would need to be called somewhere to free the memory allocated + * here and then to call kfree(pci_get_drvdata(psdev->dev)). 
+ */ + dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]") + + strlen(pci_name(dev)) + 1, GFP_ATOMIC); + if (!dev_data) { + err = -ENOMEM; + goto out; + } + pci_set_drvdata(dev, dev_data); + + /* + * Setup name for fake IRQ handler. It will only be enabled + * once the device is turned on by the guest. + */ + sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev)); + + dev_dbg(&dev->dev, "initializing config\n"); + + init_waitqueue_head(&xen_pcibk_aer_wait_queue); + err = xen_pcibk_config_init_dev(dev); + if (err) + goto out; + + /* HACK: Force device (& ACPI) to determine what IRQ it's on - we + * must do this here because pcibios_enable_device may specify + * the pci device's true irq (and possibly its other resources) + * if they differ from what's in the configuration space. + * This makes the assumption that the device's resources won't + * change after this point (otherwise this code may break!) + */ + dev_dbg(&dev->dev, "enabling device\n"); + err = pci_enable_device(dev); + if (err) + goto config_release; + + /* Now disable the device (this also ensures some private device + * data is setup before we export) + */ + dev_dbg(&dev->dev, "reset device\n"); + xen_pcibk_reset_device(dev); + + return 0; + +config_release: + xen_pcibk_config_free_dev(dev); + +out: + pci_set_drvdata(dev, NULL); + kfree(dev_data); + return err; +} + +/* + * Because some initialization still happens on + * devices during fs_initcall, we need to defer + * full initialization of our devices until + * device_initcall. + */ +static int __init pcistub_init_devices_late(void) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + pr_debug(DRV_NAME ": pcistub_init_devices_late\n"); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + while (!list_empty(&seized_devices)) { + psdev = container_of(seized_devices.next, + struct pcistub_device, dev_list); + list_del(&psdev->dev_list); + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + err = pcistub_init_device(psdev->dev); + if (err) { + dev_err(&psdev->dev->dev, + "error %d initializing device\n", err); + kfree(psdev); + psdev = NULL; + } + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (psdev) + list_add_tail(&psdev->dev_list, &pcistub_devices); + } + + initialize_devices = 1; + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + return 0; +} + +static int __devinit pcistub_seize(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + psdev = pcistub_device_alloc(dev); + if (!psdev) + return -ENOMEM; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (initialize_devices) { + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /* don't want irqs disabled when calling pcistub_init_device */ + err = pcistub_init_device(psdev->dev); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (!err) + list_add(&psdev->dev_list, &pcistub_devices); + } else { + dev_dbg(&dev->dev, "deferring initialization\n"); + list_add(&psdev->dev_list, &seized_devices); + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + if (err) + pcistub_device_put(psdev); + + return err; +} + +static int __devinit pcistub_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + int err = 0; + + dev_dbg(&dev->dev, "probing...\n"); + + if (pcistub_match(dev)) { + + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL + && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { + dev_err(&dev->dev, "can't export pci devices that " + "don't have a normal (0) or bridge (1) " + 
"header type!\n"); + err = -ENODEV; + goto out; + } + + dev_info(&dev->dev, "seizing device\n"); + err = pcistub_seize(dev); + } else + /* Didn't find the device */ + err = -ENODEV; + +out: + return err; +} + +static void pcistub_remove(struct pci_dev *dev) +{ + struct pcistub_device *psdev, *found_psdev = NULL; + unsigned long flags; + + dev_dbg(&dev->dev, "removing\n"); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + xen_pcibk_config_quirk_release(dev); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_psdev = psdev; + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + if (found_psdev) { + dev_dbg(&dev->dev, "found device to remove - in use? %p\n", + found_psdev->pdev); + + if (found_psdev->pdev) { + printk(KERN_WARNING DRV_NAME ": ****** removing device " + "%s while still in-use! ******\n", + pci_name(found_psdev->dev)); + printk(KERN_WARNING DRV_NAME ": ****** driver domain may" + " still access this device's i/o resources!\n"); + printk(KERN_WARNING DRV_NAME ": ****** shutdown driver " + "domain before binding device\n"); + printk(KERN_WARNING DRV_NAME ": ****** to other drivers " + "or domains\n"); + + xen_pcibk_release_pci_dev(found_psdev->pdev, + found_psdev->dev); + } + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_del(&found_psdev->dev_list); + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /* the final put for releasing from the list */ + pcistub_device_put(found_psdev); + } +} + +static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = { + { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + }, + {0,}, +}; + +#define PCI_NODENAME_MAX 40 +static void kill_domain_by_device(struct pcistub_device *psdev) +{ + struct xenbus_transaction xbt; + int err; + char nodename[PCI_NODENAME_MAX]; + + if (!psdev) + dev_err(&psdev->dev->dev, + "device is NULL when do AER recovery/kill_domain\n"); + snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0", + psdev->pdev->xdev->otherend_id); + nodename[strlen(nodename)] = '\0'; + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + dev_err(&psdev->dev->dev, + "error %d when start xenbus transaction\n", err); + return; + } + /*PV AER handlers will set this flag*/ + xenbus_printf(xbt, nodename, "aerState" , "aerfail"); + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + dev_err(&psdev->dev->dev, + "error %d when end xenbus transaction\n", err); + return; + } +} + +/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and + * backend need to have cooperation. In xen_pcibk, those steps will do similar + * jobs: send service request and waiting for front_end response. 
+*/
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+ pci_channel_state_t state, int aer_cmd,
+ pci_ers_result_t result)
+{
+ pci_ers_result_t res = result;
+ struct xen_pcie_aer_op *aer_op;
+ int ret;
+
+ /* with PV AER drivers */
+ aer_op = &(psdev->pdev->sh_info->aer_op);
+ aer_op->cmd = aer_cmd;
+ /* useful for the error_detected callback */
+ aer_op->err = state;
+ /* pcifront BDF */
+ ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev,
+ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
+ if (!ret) {
+ dev_err(&psdev->dev->dev,
+ DRV_NAME ": failed to get pcifront device\n");
+ return PCI_ERS_RESULT_NONE;
+ }
+ wmb();
+
+ dev_dbg(&psdev->dev->dev,
+ DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n",
+ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+ /* Local flag to mark that an AER request is pending; the xen_pcibk
+ * callback uses it to decide whether to check for pcifront's AER
+ * service ack signal.
+ */
+ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+ /* It is possible that a pcifront conf_read_write ops request invokes
+ * the callback, which causes a spurious execution of wake_up.
+ * Yet it is harmless and better than a spinlock here.
+ */
+ set_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ wmb();
+ notify_remote_via_irq(psdev->pdev->evtchn_irq);
+
+ ret = wait_event_timeout(xen_pcibk_aer_wait_queue,
+ !(test_bit(_XEN_PCIB_active, (unsigned long *)
+ &psdev->pdev->sh_info->flags)), 300*HZ);
+
+ if (!ret) {
+ if (test_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&psdev->dev->dev,
+ "pcifront aer process not responding!\n");
+ clear_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ aer_op->err = PCI_ERS_RESULT_NONE;
+ return res;
+ }
+ }
+ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+ if (test_bit(_XEN_PCIF_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&psdev->dev->dev,
+ "schedule pci_conf service in xen_pcibk\n");
+ xen_pcibk_test_and_schedule_op(psdev->pdev);
+ }
+
+ res = (pci_ers_result_t)aer_op->err;
+ return res;
+}
+
+/*
+* xen_pcibk_slot_reset: send the slot_reset request to pcifront in case
+* the device driver can provide this service, and then wait for pcifront's
+* ack. 
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "xen_pcibk_slot_reset(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto release;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER slot_reset service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*
+* xen_pcibk_mmio_enabled: send the mmio_enabled request to pcifront in case
+* the device driver can provide this service, and then wait for pcifront's
+* ack.
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "xen_pcibk_mmio_enabled(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto release;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER mmio_enabled service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*
+* xen_pcibk_error_detected: send the error_detected request to pcifront in
+* case the device driver can provide this service, and then wait for
+* pcifront's ack. 
+* @dev: pointer to PCI devices
+* @error: the current PCI connection state
+* return value is used by aer_core do_recovery policy
+*/
+static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev,
+ pci_channel_state_t error)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_CAN_RECOVER;
+ dev_dbg(&dev->dev, "xen_pcibk_error_detected(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ /* Guest owns the device, yet no AER handler is registered: kill the
+ * guest */
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER error_detected service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*
+* xen_pcibk_error_resume: send the error_resume request to pcifront in case
+* the device driver can provide this service, and then wait for pcifront's
+* ack.
+* @dev: pointer to PCI devices
+*/
+static void xen_pcibk_error_resume(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "xen_pcibk_error_resume(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ DRV_NAME " device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+ PCI_ERS_RESULT_RECOVERED);
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return;
+}
+
+/* add xen_pcibk AER handling */
+static struct pci_error_handlers xen_pcibk_error_handler = {
+ .error_detected = xen_pcibk_error_detected,
+ .mmio_enabled = xen_pcibk_mmio_enabled,
+ .slot_reset = xen_pcibk_slot_reset,
+ .resume = xen_pcibk_error_resume,
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */
+
+static struct pci_driver xen_pcibk_pci_driver = {
+ /* The name should be xen_pciback, but until the tools are updated
+ * we will keep it as pciback. 
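+ * (The driver name also fixes the sysfs path, so the control files defined
+ * below are reached as, e.g. (hypothetical BDF):
+ *   echo "0000:01:00.0" > /sys/bus/pci/drivers/pciback/new_slot
+ *   echo "0000:01:00.0-00000040:2:0000ffff" > \
+ *        /sys/bus/pci/drivers/pciback/quirks
+ * where the quirks format is domain:bus:slot.func-reg:size:mask.)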
*/ + .name = "pciback", + .id_table = pcistub_ids, + .probe = pcistub_probe, + .remove = pcistub_remove, + .err_handler = &xen_pcibk_error_handler, +}; + +static inline int str_to_slot(const char *buf, int *domain, int *bus, + int *slot, int *func) +{ + int err; + + err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); + if (err == 4) + return 0; + else if (err < 0) + return -EINVAL; + + /* try again without domain */ + *domain = 0; + err = sscanf(buf, " %x:%x.%x", bus, slot, func); + if (err == 3) + return 0; + + return -EINVAL; +} + +static inline int str_to_quirk(const char *buf, int *domain, int *bus, int + *slot, int *func, int *reg, int *size, int *mask) +{ + int err; + + err = + sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot, + func, reg, size, mask); + if (err == 7) + return 0; + return -EINVAL; +} + +static int pcistub_device_id_add(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id; + unsigned long flags; + + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); + if (!pci_dev_id) + return -ENOMEM; + + pci_dev_id->domain = domain; + pci_dev_id->bus = bus; + pci_dev_id->devfn = PCI_DEVFN(slot, func); + + pr_debug(DRV_NAME ": wants to seize %04x:%02x:%02x.%01x\n", + domain, bus, slot, func); + + spin_lock_irqsave(&device_ids_lock, flags); + list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); + spin_unlock_irqrestore(&device_ids_lock, flags); + + return 0; +} + +static int pcistub_device_id_remove(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id, *t; + int devfn = PCI_DEVFN(slot, func); + int err = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, + slot_list) { + if (pci_dev_id->domain == domain + && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) { + /* Don't break; here because it's possible the same + * slot could be in the list more than once + */ + list_del(&pci_dev_id->slot_list); + kfree(pci_dev_id); + + err = 0; + + pr_debug(DRV_NAME ": removed %04x:%02x:%02x.%01x from " + "seize list\n", domain, bus, slot, func); + } + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return err; +} + +static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, + int size, int mask) +{ + int err = 0; + struct pcistub_device *psdev; + struct pci_dev *dev; + struct config_field *field; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev || !psdev->dev) { + err = -ENODEV; + goto out; + } + dev = psdev->dev; + + field = kzalloc(sizeof(*field), GFP_ATOMIC); + if (!field) { + err = -ENOMEM; + goto out; + } + + field->offset = reg; + field->size = size; + field->mask = mask; + field->init = NULL; + field->reset = NULL; + field->release = NULL; + field->clean = xen_pcibk_config_field_free; + + err = xen_pcibk_config_quirks_add_field(dev, field); + if (err) + kfree(field); +out: + return err; +} + +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + err = pcistub_device_id_add(domain, bus, slot, func); + +out: + if (!err) + err = count; + return err; +} + +DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); + +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, 
&slot, &func); + if (err) + goto out; + + err = pcistub_device_id_remove(domain, bus, slot, func); + +out: + if (!err) + err = count; + return err; +} + +DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); + +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device_id *pci_dev_id; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { + if (count >= PAGE_SIZE) + break; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "%04x:%02x:%02x.%01x\n", + pci_dev_id->domain, pci_dev_id->bus, + PCI_SLOT(pci_dev_id->devfn), + PCI_FUNC(pci_dev_id->devfn)); + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} + +DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); + +static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data) + continue; + count += + scnprintf(buf + count, PAGE_SIZE - count, + "%s:%s:%sing:%ld\n", + pci_name(psdev->dev), + dev_data->isr_on ? "on" : "off", + dev_data->ack_intr ? "ack" : "not ack", + dev_data->handled); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} + +DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); + +static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, + const char *buf, + size_t count) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + int domain, bus, slot, func; + int err = -ENOENT; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + psdev = pcistub_device_find(domain, bus, slot, func); + + if (!psdev) + goto out; + + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data) + goto out; + + dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", + dev_data->irq_name, dev_data->isr_on, + !dev_data->isr_on); + + dev_data->isr_on = !(dev_data->isr_on); + if (dev_data->isr_on) + dev_data->ack_intr = 1; +out: + if (!err) + err = count; + return err; +} +DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch); + +static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func, reg, size, mask; + int err; + + err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size, + &mask); + if (err) + goto out; + + err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask); + +out: + if (!err) + err = count; + return err; +} + +static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) +{ + int count = 0; + unsigned long flags; + struct xen_pcibk_config_quirk *quirk; + struct xen_pcibk_dev_data *dev_data; + const struct config_field *field; + const struct config_field_entry *cfg_entry; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(quirk, &xen_pcibk_quirks, quirks_list) { + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", + quirk->pdev->bus->number, + PCI_SLOT(quirk->pdev->devfn), + PCI_FUNC(quirk->pdev->devfn), + quirk->devid.vendor, quirk->devid.device, + quirk->devid.subvendor, + quirk->devid.subdevice); 
+ + dev_data = pci_get_drvdata(quirk->pdev); + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "\t\t%08x:%01x:%08x\n", + cfg_entry->base_offset + + field->offset, field->size, + field->mask); + } + } + +out: + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} + +DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add); + +static ssize_t permissive_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + if (!psdev->dev) { + err = -ENODEV; + goto release; + } + dev_data = pci_get_drvdata(psdev->dev); + /* the driver data for a device should never be null at this point */ + if (!dev_data) { + err = -ENXIO; + goto release; + } + if (!dev_data->permissive) { + dev_data->permissive = 1; + /* Let user know that what they're doing could be unsafe */ + dev_warn(&psdev->dev->dev, "enabling permissive mode " + "configuration space accesses!\n"); + dev_warn(&psdev->dev->dev, + "permissive mode is potentially unsafe!\n"); + } +release: + pcistub_device_put(psdev); +out: + if (!err) + err = count; + return err; +} + +static ssize_t permissive_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data || !dev_data->permissive) + continue; + count += + scnprintf(buf + count, PAGE_SIZE - count, "%s\n", + pci_name(psdev->dev)); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} + +DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add); + +static void pcistub_exit(void) +{ + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_remove_slot); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_slots); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_permissive); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handlers); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handler_state); + pci_unregister_driver(&xen_pcibk_pci_driver); +} + +static int __init pcistub_init(void) +{ + int pos = 0; + int err = 0; + int domain, bus, slot, func; + int parsed; + + if (pci_devs_to_hide && *pci_devs_to_hide) { + do { + parsed = 0; + + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.%x) %n", + &domain, &bus, &slot, &func, &parsed); + if (err != 4) { + domain = 0; + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x.%x) %n", + &bus, &slot, &func, &parsed); + if (err != 3) + goto parse_error; + } + + err = pcistub_device_id_add(domain, bus, slot, func); + if (err) + goto out; + + /* if parsed<=0, we've reached the end of the string */ + pos += parsed; + } while (parsed > 0 && pci_devs_to_hide[pos]); + } + + /* If we're the first PCI Device 
Driver to register, we're the + * first one to get offered PCI devices as they become + * available (and thus we can be the first to grab them) + */ + err = pci_register_driver(&xen_pcibk_pci_driver); + if (err < 0) + goto out; + + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_new_slot); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_remove_slot); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_slots); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_quirks); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_permissive); + + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handlers); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handler_state); + if (err) + pcistub_exit(); + +out: + return err; + +parse_error: + printk(KERN_ERR DRV_NAME ": Error parsing pci_devs_to_hide at \"%s\"\n", + pci_devs_to_hide + pos); + return -EINVAL; +} + +#ifndef MODULE +/* + * fs_initcall happens before device_initcall + * so xen_pcibk *should* get called first (b/c we + * want to suck up any device before other drivers + * get a chance by being the first pci device + * driver to register) + */ +fs_initcall(pcistub_init); +#endif + +static int __init xen_pcibk_init(void) +{ + int err; + + if (!xen_initial_domain()) + return -ENODEV; + + err = xen_pcibk_config_init(); + if (err) + return err; + +#ifdef MODULE + err = pcistub_init(); + if (err < 0) + return err; +#endif + + pcistub_init_devices_late(); + err = xen_pcibk_xenbus_register(); + if (err) + pcistub_exit(); + + return err; +} + +static void __exit xen_pcibk_cleanup(void) +{ + xen_pcibk_xenbus_unregister(); + pcistub_exit(); +} + +module_init(xen_pcibk_init); +module_exit(xen_pcibk_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h new file mode 100644 index 000000000000..a0e131a81503 --- /dev/null +++ b/drivers/xen/xen-pciback/pciback.h @@ -0,0 +1,183 @@ +/* + * PCI Backend Common Data Structures & Function Declarations + * + * Author: Ryan Wilson + */ +#ifndef __XEN_PCIBACK_H__ +#define __XEN_PCIBACK_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct pci_dev_entry { + struct list_head list; + struct pci_dev *dev; +}; + +#define _PDEVF_op_active (0) +#define PDEVF_op_active (1<<(_PDEVF_op_active)) +#define _PCIB_op_pending (1) +#define PCIB_op_pending (1<<(_PCIB_op_pending)) + +struct xen_pcibk_device { + void *pci_dev_data; + spinlock_t dev_lock; + struct xenbus_device *xdev; + struct xenbus_watch be_watch; + u8 be_watching; + int evtchn_irq; + struct xen_pci_sharedinfo *sh_info; + unsigned long flags; + struct work_struct op_work; +}; + +struct xen_pcibk_dev_data { + struct list_head config_fields; + unsigned int permissive:1; + unsigned int warned_on_write:1; + unsigned int enable_intx:1; + unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ + unsigned int ack_intr:1; /* .. 
and ACK-ing */
+ unsigned long handled;
+ unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
+ char irq_name[0]; /* xen-pcibk[000:04:00.0] */
+};
+
+/* Used by XenBus and xen_pcibk_ops.c */
+extern wait_queue_head_t xen_pcibk_aer_wait_queue;
+extern struct workqueue_struct *xen_pcibk_wq;
+/* Used by pcistub.c and conf_space_quirks.c */
+extern struct list_head xen_pcibk_quirks;
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
+ int domain, int bus,
+ int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void xen_pcibk_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int xen_pcibk_config_init(void);
+int xen_pcibk_config_init_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev);
+void xen_pcibk_config_reset_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dev(struct pci_dev *dev);
+int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size,
+ u32 *ret_val);
+int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size,
+ u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus);
+
+/* Backend registration for the two types of BDF representation:
+ * vpci - BDFs start at 00
+ * passthrough - BDFs are exactly like in the host.
+ */
+struct xen_pcibk_backend {
+ char *name;
+ int (*init)(struct xen_pcibk_device *pdev);
+ void (*free)(struct xen_pcibk_device *pdev);
+ int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn);
+ int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb);
+ void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev);
+ int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb);
+ struct pci_dev *(*get)(struct xen_pcibk_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn);
+};
+
+extern struct xen_pcibk_backend xen_pcibk_vpci_backend;
+extern struct xen_pcibk_backend xen_pcibk_passthrough_backend;
+extern struct xen_pcibk_backend *xen_pcibk_backend;
+
+static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev,
+ int devid,
+ publish_pci_dev_cb publish_cb)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->add)
+ return xen_pcibk_backend->add(pdev, dev, devid, publish_cb);
+ return -1;
+};
+static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->release)
+ return xen_pcibk_backend->release(pdev, dev);
+};
+
+static inline struct pci_dev *
+xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain,
+ unsigned int bus, unsigned int devfn)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->get)
+ return xen_pcibk_backend->get(pdev, domain, bus, devfn);
+ return NULL;
+};
+/**
+* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk
+* before sending aer request to pcifront, so that guest could identify
+* device, cooperate with xen_pcibk to finish aer recovery job if device driver
+* has the capability
+*/
+static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+ struct xen_pcibk_device *pdev,
+ unsigned int *domain,
+ unsigned int *bus,
+ unsigned int *devfn)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->find)
+ return xen_pcibk_backend->find(pcidev, pdev, domain, bus,
+ devfn);
+ return -1;
+};
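
Every wrapper in this header follows the same guard-then-dispatch shape: confirm that a backend was selected and that it implements the hook, call it, and otherwise fall back to a benign default (-1 or NULL). A minimal user-space restatement of the pattern, with invented names (demo_ops, demo_backend, demo_init) that are not part of this patch:

	#include <stdio.h>
	#include <stddef.h>

	struct demo_ops {
		const char *name;
		int (*init)(int id);	/* optional hook; may be NULL */
	};

	static int vpci_init(int id)
	{
		return printf("vpci init %d\n", id) < 0 ? -1 : 0;
	}

	/* Exactly one backend is selected at start-up, the way
	 * xen_pcibk_xenbus_register() below picks vpci or passthrough. */
	static struct demo_ops vpci = { "vpci", vpci_init };
	static struct demo_ops *demo_backend = &vpci;

	static int demo_init(int id)
	{
		/* Guard both the backend pointer and the hook before
		 * dispatching; -1 mirrors the wrappers above. */
		if (demo_backend && demo_backend->init)
			return demo_backend->init(id);
		return -1;
	}

	int main(void)
	{
		return demo_init(7) < 0;
	}

The NULL checks cost nothing on the fast path and let a backend leave rarely used hooks unimplemented without crashing the caller.
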
+static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->init)
+ return xen_pcibk_backend->init(pdev);
+ return -1;
+};
+static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+ publish_pci_root_cb cb)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->publish)
+ return xen_pcibk_backend->publish(pdev, cb);
+ return -1;
+};
+static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+ if (xen_pcibk_backend && xen_pcibk_backend->free)
+ return xen_pcibk_backend->free(pdev);
+};
+/* Handles events from front-end */
+irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id);
+void xen_pcibk_do_op(struct work_struct *data);
+
+int xen_pcibk_xenbus_register(void);
+void xen_pcibk_xenbus_unregister(void);
+
+extern int verbose_request;
+
+void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev);
+#endif
+
+/* Handles shared IRQs that can go to the device domain and the
+ * control domain. */
+void xen_pcibk_irq_handler(struct pci_dev *dev, int reset);
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
new file mode 100644
index 000000000000..8c95c3415b75
--- /dev/null
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -0,0 +1,384 @@
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ * Author: Ryan Wilson
+ */
+#include
+#include
+#include
+#include
+#include
+#include "pciback.h"
+
+#define DRV_NAME "xen-pciback"
+int verbose_request;
+module_param(verbose_request, int, 0644);
+
+static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id);
+
+/* Ensure a device has the fake IRQ handler "turned on/off" and is
+ * ready to be exported. This MUST be run after xen_pcibk_reset_device
+ * which does the actual PCI device enable/disable.
+ */
+static void xen_pcibk_control_isr(struct pci_dev *dev, int reset)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ int rc;
+ int enable = 0;
+
+ dev_data = pci_get_drvdata(dev);
+ if (!dev_data)
+ return;
+
+ /* We don't deal with bridges */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return;
+
+ if (reset) {
+ dev_data->enable_intx = 0;
+ dev_data->ack_intr = 0;
+ }
+ enable = dev_data->enable_intx;
+
+ /* Asked to disable, but ISR isn't running */
+ if (!enable && !dev_data->isr_on)
+ return;
+
+ /* Squirrel away the IRQs in the dev_data. We need this
+ * b/c when device transitions to MSI, the dev->irq is
+ * overwritten with the MSI vector.
+ */
+ if (enable)
+ dev_data->irq = dev->irq;
+
+ /*
+ * SR-IOV devices all use MSI-X and have no legacy
+ * interrupts, so inhibit creating a fake IRQ handler for them.
+ */
+ if (dev_data->irq == 0)
+ goto out;
+
+ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
+ dev_data->irq_name,
+ dev_data->irq,
+ pci_is_enabled(dev) ? "on" : "off",
+ dev->msi_enabled ? "MSI" : "",
+ dev->msix_enabled ? "MSI/X" : "",
+ dev_data->isr_on ? "enable" : "disable",
+ enable ?
"enable" : "disable"); + + if (enable) { + rc = request_irq(dev_data->irq, + xen_pcibk_guest_interrupt, IRQF_SHARED, + dev_data->irq_name, dev); + if (rc) { + dev_err(&dev->dev, "%s: failed to install fake IRQ " \ + "handler for IRQ %d! (rc:%d)\n", + dev_data->irq_name, dev_data->irq, rc); + goto out; + } + } else { + free_irq(dev_data->irq, dev); + dev_data->irq = 0; + } + dev_data->isr_on = enable; + dev_data->ack_intr = enable; +out: + dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n", + dev_data->irq_name, + dev_data->irq, + pci_is_enabled(dev) ? "on" : "off", + dev->msi_enabled ? "MSI" : "", + dev->msix_enabled ? "MSI/X" : "", + enable ? (dev_data->isr_on ? "enabled" : "failed to enable") : + (dev_data->isr_on ? "failed to disable" : "disabled")); +} + +/* Ensure a device is "turned off" and ready to be exported. + * (Also see xen_pcibk_config_reset to ensure virtual configuration space is + * ready to be re-exported) + */ +void xen_pcibk_reset_device(struct pci_dev *dev) +{ + u16 cmd; + + xen_pcibk_control_isr(dev, 1 /* reset device */); + + /* Disable devices (but not bridges) */ + if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) { +#ifdef CONFIG_PCI_MSI + /* The guest could have been abruptly killed without + * disabling MSI/MSI-X interrupts.*/ + if (dev->msix_enabled) + pci_disable_msix(dev); + if (dev->msi_enabled) + pci_disable_msi(dev); +#endif + pci_disable_device(dev); + + pci_write_config_word(dev, PCI_COMMAND, 0); + + dev->is_busmaster = 0; + } else { + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (cmd & (PCI_COMMAND_INVALIDATE)) { + cmd &= ~(PCI_COMMAND_INVALIDATE); + pci_write_config_word(dev, PCI_COMMAND, cmd); + + dev->is_busmaster = 0; + } + } +} + +#ifdef CONFIG_PCI_MSI +static +int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + int otherend = pdev->xdev->otherend_id; + int status; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev)); + + status = pci_enable_msi(dev); + + if (status) { + printk(KERN_ERR "error enable msi for guest %x status %x\n", + otherend, status); + op->value = 0; + return XEN_PCI_ERR_op_failed; + } + + /* The value the guest needs is actually the IDT vector, not the + * the local domain's IRQ number. */ + + op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), + op->value); + + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 0; + + return 0; +} + +static +int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n", + pci_name(dev)); + pci_disable_msi(dev); + + op->value = dev->irq ? 
xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
+ op->value);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
+
+static
+int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ int i, result;
+ struct msix_entry *entries;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n",
+ pci_name(dev));
+ if (op->value > SH_INFO_MAX_VEC)
+ return -EINVAL;
+
+ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
+ if (entries == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < op->value; i++) {
+ entries[i].entry = op->msix_entries[i].entry;
+ entries[i].vector = op->msix_entries[i].vector;
+ }
+
+ result = pci_enable_msix(dev, entries, op->value);
+
+ if (result == 0) {
+ for (i = 0; i < op->value; i++) {
+ op->msix_entries[i].entry = entries[i].entry;
+ if (entries[i].vector)
+ op->msix_entries[i].vector =
+ xen_pirq_from_irq(entries[i].vector);
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: " \
+ "MSI-X[%d]: %d\n",
+ pci_name(dev), i,
+ op->msix_entries[i].vector);
+ }
+ } else {
+ printk(KERN_WARNING DRV_NAME ": %s: failed to enable MSI-X: err %d!\n",
+ pci_name(dev), result);
+ }
+ kfree(entries);
+
+ op->value = result;
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 0;
+
+ return result;
+}
+
+static
+int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct xen_pcibk_dev_data *dev_data;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n",
+ pci_name(dev));
+ pci_disable_msix(dev);
+
+ /*
+ * SR-IOV devices (which don't have any legacy IRQ) have
+ * an undefined IRQ value of zero.
+ */
+ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev),
+ op->value);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
+#endif
+/*
+* Now the same evtchn is used for both pcifront conf_read_write request
+* as well as pcie aer front end ack. We use a new work_queue to schedule
+* xen_pcibk conf_read_write service for avoiding conflict with aer_core
+* do_recovery job which also uses the system default work_queue
+*/
+void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev)
+{
+ /* Check that frontend is requesting an operation and that we are not
+ * already processing a request */
+ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
+ queue_work(xen_pcibk_wq, &pdev->op_work);
+ }
+ /*_XEN_PCIB_active should have been cleared by pcifront. And also make
+ sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/
+ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+ && test_bit(_PCIB_op_pending, &pdev->flags)) {
+ wake_up(&xen_pcibk_aer_wait_queue);
+ }
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores).
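
The scheduling test above pairs with the completion path at the tail of xen_pcibk_do_op() below: the worker clears _XEN_PCIF_active, then drops _PDEVF_op_active with memory barriers around the clear, and finally re-runs the test so a request that arrived between the two clears is not lost. A condensed sketch of that handshake, with illustrative names (demo_*, REQ_BIT, BUSY_BIT) that are not part of the patch:

	#include <linux/bitops.h>
	#include <linux/workqueue.h>

	#define REQ_BIT		0	/* stands in for _XEN_PCIF_active */
	#define BUSY_BIT	0	/* stands in for _PDEVF_op_active */

	struct demo {
		unsigned long shared_flags;	/* like sh_info->flags */
		unsigned long local_flags;	/* like pdev->flags    */
		struct workqueue_struct *wq;
		struct work_struct work;
	};

	/* Producer: schedule only if a request is pending and no op runs. */
	static void demo_schedule(struct demo *d)
	{
		if (test_bit(REQ_BIT, &d->shared_flags) &&
		    !test_and_set_bit(BUSY_BIT, &d->local_flags))
			queue_work(d->wq, &d->work);
	}

	/* Consumer: run at the very end of the work item. */
	static void demo_complete(struct demo *d)
	{
		clear_bit(REQ_BIT, &d->shared_flags);	/* ack the request */
		smp_mb__before_clear_bit();	/* ack before dropping busy */
		clear_bit(BUSY_BIT, &d->local_flags);
		smp_mb__after_clear_bit();	/* drop busy before re-scan */
		demo_schedule(d);	/* pick up a request that raced */
	}

Without the final re-check, a request raised after the REQ acknowledgement but before BUSY is dropped would never be serviced.
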
This function is intended to be called from a work + * queue in process context taking a struct xen_pcibk_device as a parameter */ + +void xen_pcibk_do_op(struct work_struct *data) +{ + struct xen_pcibk_device *pdev = + container_of(data, struct xen_pcibk_device, op_work); + struct pci_dev *dev; + struct xen_pcibk_dev_data *dev_data = NULL; + struct xen_pci_op *op = &pdev->sh_info->op; + int test_intx = 0; + + dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn); + + if (dev == NULL) + op->err = XEN_PCI_ERR_dev_not_found; + else { + dev_data = pci_get_drvdata(dev); + if (dev_data) + test_intx = dev_data->enable_intx; + switch (op->cmd) { + case XEN_PCI_OP_conf_read: + op->err = xen_pcibk_config_read(dev, + op->offset, op->size, &op->value); + break; + case XEN_PCI_OP_conf_write: + op->err = xen_pcibk_config_write(dev, + op->offset, op->size, op->value); + break; +#ifdef CONFIG_PCI_MSI + case XEN_PCI_OP_enable_msi: + op->err = xen_pcibk_enable_msi(pdev, dev, op); + break; + case XEN_PCI_OP_disable_msi: + op->err = xen_pcibk_disable_msi(pdev, dev, op); + break; + case XEN_PCI_OP_enable_msix: + op->err = xen_pcibk_enable_msix(pdev, dev, op); + break; + case XEN_PCI_OP_disable_msix: + op->err = xen_pcibk_disable_msix(pdev, dev, op); + break; +#endif + default: + op->err = XEN_PCI_ERR_not_implemented; + break; + } + } + if (!op->err && dev && dev_data) { + /* Transition detected */ + if ((dev_data->enable_intx != test_intx)) + xen_pcibk_control_isr(dev, 0 /* no reset */); + } + /* Tell the driver domain that we're done. */ + wmb(); + clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); + notify_remote_via_irq(pdev->evtchn_irq); + + /* Mark that we're done. */ + smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + clear_bit(_PDEVF_op_active, &pdev->flags); + smp_mb__after_clear_bit(); /* /before/ final check for work */ + + /* Check to see if the driver domain tried to start another request in + * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. + */ + xen_pcibk_test_and_schedule_op(pdev); +} + +irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id) +{ + struct xen_pcibk_device *pdev = dev_id; + + xen_pcibk_test_and_schedule_op(pdev); + + return IRQ_HANDLED; +} +static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) +{ + struct pci_dev *dev = (struct pci_dev *)dev_id; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + if (dev_data->isr_on && dev_data->ack_intr) { + dev_data->handled++; + if ((dev_data->handled % 1000) == 0) { + if (xen_test_irq_shared(irq)) { + printk(KERN_INFO "%s IRQ line is not shared " + "with other domains. 
Turning ISR off\n", + dev_data->irq_name); + dev_data->ack_intr = 0; + } + } + return IRQ_HANDLED; + } + return IRQ_NONE; +} diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c new file mode 100644 index 000000000000..4a42cfb0959d --- /dev/null +++ b/drivers/xen/xen-pciback/vpci.c @@ -0,0 +1,259 @@ +/* + * PCI Backend - Provides a Virtual PCI bus (with real devices) + * to the frontend + * + * Author: Ryan Wilson + */ + +#include +#include +#include +#include +#include "pciback.h" + +#define PCI_SLOT_MAX 32 +#define DRV_NAME "xen-pciback" + +struct vpci_dev_data { + /* Access to dev_list must be protected by lock */ + struct list_head dev_list[PCI_SLOT_MAX]; + spinlock_t lock; +}; + +static inline struct list_head *list_first(struct list_head *head) +{ + return head->next; +} + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, + unsigned int bus, + unsigned int devfn) +{ + struct pci_dev_entry *entry; + struct pci_dev *dev = NULL; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + unsigned long flags; + + if (domain != 0 || bus != 0) + return NULL; + + if (PCI_SLOT(devfn) < PCI_SLOT_MAX) { + spin_lock_irqsave(&vpci_dev->lock, flags); + + list_for_each_entry(entry, + &vpci_dev->dev_list[PCI_SLOT(devfn)], + list) { + if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) { + dev = entry->dev; + break; + } + } + + spin_unlock_irqrestore(&vpci_dev->lock, flags); + } + return dev; +} + +static inline int match_slot(struct pci_dev *l, struct pci_dev *r) +{ + if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus) + && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn)) + return 1; + + return 0; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, int devid, + publish_pci_dev_cb publish_cb) +{ + int err = 0, slot, func = -1; + struct pci_dev_entry *t, *dev_entry; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + unsigned long flags; + + if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { + err = -EFAULT; + xenbus_dev_fatal(pdev->xdev, err, + "Can't export bridges on the virtual PCI bus"); + goto out; + } + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); + if (!dev_entry) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "Error adding entry to virtual PCI bus"); + goto out; + } + + dev_entry->dev = dev; + + spin_lock_irqsave(&vpci_dev->lock, flags); + + /* Keep multi-function devices together on the virtual PCI bus */ + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (!list_empty(&vpci_dev->dev_list[slot])) { + t = list_entry(list_first(&vpci_dev->dev_list[slot]), + struct pci_dev_entry, list); + + if (match_slot(dev, t->dev)) { + pr_info(DRV_NAME ": vpci: %s: " + "assign to virtual slot %d func %d\n", + pci_name(dev), slot, + PCI_FUNC(dev->devfn)); + list_add_tail(&dev_entry->list, + &vpci_dev->dev_list[slot]); + func = PCI_FUNC(dev->devfn); + goto unlock; + } + } + } + + /* Assign to a new slot on the virtual PCI bus */ + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) { + printk(KERN_INFO DRV_NAME + ": vpci: %s: assign to virtual slot %d\n", + pci_name(dev), slot); + list_add_tail(&dev_entry->list, + &vpci_dev->dev_list[slot]); + func = PCI_FUNC(dev->devfn); + goto unlock; + } + } + + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "No more space on root virtual PCI bus"); + +unlock: + spin_unlock_irqrestore(&vpci_dev->lock, flags); + + /* Publish this device. 
*/ + if (!err) + err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + +out: + return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&vpci_dev->lock, flags); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e, *tmp; + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], + list) { + if (e->dev == dev) { + list_del(&e->list); + found_dev = e->dev; + kfree(e); + goto out; + } + } + } + +out: + spin_unlock_irqrestore(&vpci_dev->lock, flags); + + if (found_dev) + pcistub_put_pci_dev(found_dev); +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev; + + vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL); + if (!vpci_dev) + return -ENOMEM; + + spin_lock_init(&vpci_dev->lock); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) + INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); + + pdev->pci_dev_data = vpci_dev; + + return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb publish_cb) +{ + /* The Virtual PCI bus has only one root */ + return publish_cb(pdev, 0, 0); +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e, *tmp; + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], + list) { + list_del(&e->list); + pcistub_put_pci_dev(e->dev); + kfree(e); + } + } + + kfree(vpci_dev); + pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn) +{ + struct pci_dev_entry *entry; + struct pci_dev *dev = NULL; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + unsigned long flags; + int found = 0, slot; + + spin_lock_irqsave(&vpci_dev->lock, flags); + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + list_for_each_entry(entry, + &vpci_dev->dev_list[slot], + list) { + dev = entry->dev; + if (dev && dev->bus->number == pcidev->bus->number + && pci_domain_nr(dev->bus) == + pci_domain_nr(pcidev->bus) + && dev->devfn == pcidev->devfn) { + found = 1; + *domain = 0; + *bus = 0; + *devfn = PCI_DEVFN(slot, + PCI_FUNC(pcidev->devfn)); + } + } + } + spin_unlock_irqrestore(&vpci_dev->lock, flags); + return found; +} + +struct xen_pcibk_backend xen_pcibk_vpci_backend = { + .name = "vpci", + .init = __xen_pcibk_init_devices, + .free = __xen_pcibk_release_devices, + .find = __xen_pcibk_get_pcifront_dev, + .publish = __xen_pcibk_publish_pci_roots, + .release = __xen_pcibk_release_pci_dev, + .add = __xen_pcibk_add_pci_dev, + .get = __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c new file mode 100644 index 000000000000..206c4ce030bc --- /dev/null +++ b/drivers/xen/xen-pciback/xenbus.c @@ -0,0 +1,749 @@ +/* + * PCI Backend Xenbus Setup - handles setup with frontend and xend + * + * Author: Ryan Wilson + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pciback.h" + +#define DRV_NAME "xen-pciback" +#define INVALID_EVTCHN_IRQ (-1) +struct workqueue_struct *xen_pcibk_wq; + +static int __read_mostly passthrough; +module_param(passthrough, bool, 
S_IRUGO);
+MODULE_PARM_DESC(passthrough,
+ "Option to specify how to export PCI topology to guest:\n"\
+ " 0 - (default) Hide the true PCI topology and makes the frontend\n"\
+ " think there is a single PCI bus with only the exported devices on it.\n"\
+ " For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\
+ " while the second device at 02:1a.1 will be re-assigned to 00:01.1.\n"\
+ " 1 - Passthrough provides a real view of the PCI topology to the\n"\
+ " frontend (for example, a device at 06:01.b will still appear at\n"\
+ " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\
+ " exposed PCI devices to its driver domains. This may be required\n"\
+ " for drivers which depend on finding their hardware in certain\n"\
+ " bus/slot locations.");
+
+static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
+{
+ struct xen_pcibk_device *pdev;
+
+ pdev = kzalloc(sizeof(struct xen_pcibk_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+ pdev->xdev = xdev;
+ dev_set_drvdata(&xdev->dev, pdev);
+
+ spin_lock_init(&pdev->dev_lock);
+
+ pdev->sh_info = NULL;
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ pdev->be_watching = 0;
+
+ INIT_WORK(&pdev->op_work, xen_pcibk_do_op);
+
+ if (xen_pcibk_init_devices(pdev)) {
+ kfree(pdev);
+ pdev = NULL;
+ }
+out:
+ return pdev;
+}
+
+static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev)
+{
+ spin_lock(&pdev->dev_lock);
+
+ /* Ensure the guest can't trigger our handler before removing devices */
+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ }
+ spin_unlock(&pdev->dev_lock);
+
+ /* If the driver domain started an op, make sure we complete it
+ * before releasing the shared memory */
+
+ /* Note, the workqueue does not use spinlocks at all.*/
+ flush_workqueue(xen_pcibk_wq);
+
+ spin_lock(&pdev->dev_lock);
+ if (pdev->sh_info != NULL) {
+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+ pdev->sh_info = NULL;
+ }
+ spin_unlock(&pdev->dev_lock);
+
+}
+
+static void free_pdev(struct xen_pcibk_device *pdev)
+{
+ if (pdev->be_watching) {
+ unregister_xenbus_watch(&pdev->be_watch);
+ pdev->be_watching = 0;
+ }
+
+ xen_pcibk_disconnect(pdev);
+
+ xen_pcibk_release_devices(pdev);
+
+ dev_set_drvdata(&pdev->xdev->dev, NULL);
+ pdev->xdev = NULL;
+
+ kfree(pdev);
+}
+
+static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
+ int remote_evtchn)
+{
+ int err = 0;
+ void *vaddr;
+
+ dev_dbg(&pdev->xdev->dev,
+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+ gnt_ref, remote_evtchn);
+
+ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error mapping other domain page in ours.");
+ goto out;
+ }
+
+ spin_lock(&pdev->dev_lock);
+ pdev->sh_info = vaddr;
+ spin_unlock(&pdev->dev_lock);
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event,
+ 0, DRV_NAME, pdev);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error binding event channel to IRQ");
+ goto out;
+ }
+
+ spin_lock(&pdev->dev_lock);
+ pdev->evtchn_irq = err;
+ spin_unlock(&pdev->dev_lock);
+ err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+ return err;
+}
+
+static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
+{
+ int err = 0;
+ int gnt_ref, remote_evtchn;
+ char *magic = NULL;
+
+
+ /* Make sure we only do this setup once */
+ if
(xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitialised) + goto out; + + /* Wait for frontend to state that it has published the configuration */ + if (xenbus_read_driver_state(pdev->xdev->otherend) != + XenbusStateInitialised) + goto out; + + dev_dbg(&pdev->xdev->dev, "Reading frontend config\n"); + + err = xenbus_gather(XBT_NIL, pdev->xdev->otherend, + "pci-op-ref", "%u", &gnt_ref, + "event-channel", "%u", &remote_evtchn, + "magic", NULL, &magic, NULL); + if (err) { + /* If configuration didn't get read correctly, wait longer */ + xenbus_dev_fatal(pdev->xdev, err, + "Error reading configuration from frontend"); + goto out; + } + + if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) { + xenbus_dev_fatal(pdev->xdev, -EFAULT, + "version mismatch (%s/%s) with pcifront - " + "halting xen_pcibk", + magic, XEN_PCI_MAGIC); + goto out; + } + + err = xen_pcibk_do_attach(pdev, gnt_ref, remote_evtchn); + if (err) + goto out; + + dev_dbg(&pdev->xdev->dev, "Connecting...\n"); + + err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to connected state!"); + + dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err); +out: + + kfree(magic); + + return err; +} + +static int xen_pcibk_publish_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn, unsigned int devid) +{ + int err; + int len; + char str[64]; + + len = snprintf(str, sizeof(str), "vdev-%d", devid); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x:%02x.%02x", domain, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + +out: + return err; +} + +static int xen_pcibk_export_device(struct xen_pcibk_device *pdev, + int domain, int bus, int slot, int func, + int devid) +{ + struct pci_dev *dev; + int err = 0; + + dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func); + if (!dev) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Couldn't locate PCI device " + "(%04x:%02x:%02x.%01x)! " + "perhaps already in-use?", + domain, bus, slot, func); + goto out; + } + + err = xen_pcibk_add_pci_dev(pdev, dev, devid, + xen_pcibk_publish_pci_dev); + if (err) + goto out; + + dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); + if (xen_register_device_domain_owner(dev, + pdev->xdev->otherend_id) != 0) { + dev_err(&dev->dev, "device has been assigned to another " \ + "domain! Over-writting the ownership, but beware.\n"); + xen_unregister_device_domain_owner(dev); + xen_register_device_domain_owner(dev, pdev->xdev->otherend_id); + } + + /* TODO: It'd be nice to export a bridge and have all of its children + * get exported with it. This may be best done in xend (which will + * have to calculate resource usage anyway) but we probably want to + * put something in here to ensure that if a bridge gets given to a + * driver domain, that all devices under that bridge are not given + * to other driver domains (as he who controls the bridge can disable + * it and stop the other devices from working). 
+ */ +out: + return err; +} + +static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, + int domain, int bus, int slot, int func) +{ + int err = 0; + struct pci_dev *dev; + + dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = xen_pcibk_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func)); + if (!dev) { + err = -EINVAL; + dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device " + "(%04x:%02x:%02x.%01x)! not owned by this domain\n", + domain, bus, slot, func); + goto out; + } + + dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); + xen_unregister_device_domain_owner(dev); + + xen_pcibk_release_pci_dev(pdev, dev); + +out: + return err; +} + +static int xen_pcibk_publish_pci_root(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus) +{ + unsigned int d, b; + int i, root_num, len, err; + char str[64]; + + dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", &root_num); + if (err == 0 || err == -ENOENT) + root_num = 0; + else if (err < 0) + goto out; + + /* Verify that we haven't already published this pci root */ + for (i = 0; i < root_num; i++) { + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + str, "%x:%x", &d, &b); + if (err < 0) + goto out; + if (err != 2) { + err = -EINVAL; + goto out; + } + + if (d == domain && b == bus) { + err = 0; + goto out; + } + } + + len = snprintf(str, sizeof(str), "root-%d", root_num); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n", + root_num, domain, bus); + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x", domain, bus); + if (err) + goto out; + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", (root_num + 1)); + +out: + return err; +} + +static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) +{ + int err = 0; + int num_devs; + int domain, bus, slot, func; + int substate; + int i, len; + char state_str[64]; + char dev_str[64]; + + + dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); + + /* Make sure we only reconfigure once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateReconfiguring) + goto out; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + len = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(len >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", &substate); + if (err != 1) + substate = XenbusStateUnknown; + + switch (substate) { + case XenbusStateInitialising: + dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) 
{ + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_export_device(pdev, domain, bus, slot, + func, i); + if (err) + goto out; + + /* Publish pci roots. */ + err = xen_pcibk_publish_pci_roots(pdev, + xen_pcibk_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root" + "buses for frontend"); + goto out; + } + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + state_str, "%d", + XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching substate of " + "dev-%d\n", i); + goto out; + } + break; + + case XenbusStateClosing: + dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_remove_device(pdev, domain, bus, slot, + func); + if (err) + goto out; + + /* TODO: If at some point we implement support for pci + * root hot-remove on pcifront side, we'll need to + * remove unnecessary xenstore nodes of pci roots here. + */ + + break; + + default: + break; + } + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to reconfigured state!"); + goto out; + } + +out: + return 0; +} + +static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, + enum xenbus_state fe_state) +{ + struct xen_pcibk_device *pdev = dev_get_drvdata(&xdev->dev); + + dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); + + switch (fe_state) { + case XenbusStateInitialised: + xen_pcibk_attach(pdev); + break; + + case XenbusStateReconfiguring: + xen_pcibk_reconfigure(pdev); + break; + + case XenbusStateConnected: + /* pcifront switched its state from reconfiguring to connected. + * Then switch to connected state. + */ + xenbus_switch_state(xdev, XenbusStateConnected); + break; + + case XenbusStateClosing: + xen_pcibk_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xen_pcibk_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosed); + if (xenbus_dev_is_online(xdev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); + device_unregister(&xdev->dev); + break; + + default: + break; + } +} + +static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev) +{ + /* Get configuration from xend (if available now) */ + int domain, bus, slot, func; + int err = 0; + int i, num_devs; + char dev_str[64]; + char state_str[64]; + + /* It's possible we could get the call to setup twice, so make sure + * we're not already connected. 
+ */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitWait) + goto out; + + dev_dbg(&pdev->xdev->dev, "getting be setup\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(l >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str, + "%x:%x:%x.%x", &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_export_device(pdev, domain, bus, slot, func, i); + if (err) + goto out; + + /* Switch substate of this device. */ + l = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(l >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, "Error switching " + "substate of dev-%d\n", i); + goto out; + } + } + + err = xen_pcibk_publish_pci_roots(pdev, xen_pcibk_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root buses " + "for frontend"); + goto out; + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised); + if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to initialised state!"); + +out: + if (!err) + /* see if pcifront is already configured (if not, we'll wait) */ + xen_pcibk_attach(pdev); + + return err; +} + +static void xen_pcibk_be_watch(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + struct xen_pcibk_device *pdev = + container_of(watch, struct xen_pcibk_device, be_watch); + + switch (xenbus_read_driver_state(pdev->xdev->nodename)) { + case XenbusStateInitWait: + xen_pcibk_setup_backend(pdev); + break; + + default: + break; + } +} + +static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err = 0; + struct xen_pcibk_device *pdev = alloc_pdev(dev); + + if (pdev == NULL) { + err = -ENOMEM; + xenbus_dev_fatal(dev, err, + "Error allocating xen_pcibk_device struct"); + goto out; + } + + /* wait for xend to configure us */ + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto out; + + /* watch the backend node for backend configuration information */ + err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, + xen_pcibk_be_watch); + if (err) + goto out; + + pdev->be_watching = 1; + + /* We need to force a call to our callback here in case + * xend already configured us! 
+ */
+ xen_pcibk_be_watch(&pdev->be_watch, NULL, 0);
+
+out:
+ return err;
+}
+
+static int xen_pcibk_xenbus_remove(struct xenbus_device *dev)
+{
+ struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev);
+
+ if (pdev != NULL)
+ free_pdev(pdev);
+
+ return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+ {"pci"},
+ {""},
+};
+
+static struct xenbus_driver xenbus_xen_pcibk_driver = {
+ .name = DRV_NAME,
+ .owner = THIS_MODULE,
+ .ids = xenpci_ids,
+ .probe = xen_pcibk_xenbus_probe,
+ .remove = xen_pcibk_xenbus_remove,
+ .otherend_changed = xen_pcibk_frontend_changed,
+};
+
+struct xen_pcibk_backend *xen_pcibk_backend;
+
+int __init xen_pcibk_xenbus_register(void)
+{
+ xen_pcibk_wq = create_workqueue("xen_pciback_workqueue");
+ if (!xen_pcibk_wq) {
+ printk(KERN_ERR "%s: create "
+ "xen_pciback_workqueue failed\n", __func__);
+ return -EFAULT;
+ }
+ xen_pcibk_backend = &xen_pcibk_vpci_backend;
+ if (passthrough)
+ xen_pcibk_backend = &xen_pcibk_passthrough_backend;
+ pr_info(DRV_NAME ": backend is %s\n", xen_pcibk_backend->name);
+ return xenbus_register_backend(&xenbus_xen_pcibk_driver);
+}
+
+void __exit xen_pcibk_xenbus_unregister(void)
+{
+ destroy_workqueue(xen_pcibk_wq);
+ xenbus_unregister_driver(&xenbus_xen_pcibk_driver);
+}
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
new file mode 100644
index 000000000000..010937b5a7c9
--- /dev/null
+++ b/drivers/xen/xen-selfballoon.c
@@ -0,0 +1,485 @@
+/******************************************************************************
+ * Xen selfballoon driver (and optional frontswap self-shrinking driver)
+ *
+ * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ *
+ * This code complements the cleancache and frontswap patchsets to optimize
+ * support for Xen Transcendent Memory ("tmem"). The policy it implements
+ * is rudimentary and will likely improve over time, but it does work well
+ * enough today.
+ *
+ * Two functionalities are implemented here which both use "control theory"
+ * (feedback) to optimize memory utilization. In a virtualized environment
+ * such as Xen, RAM is often a scarce resource and we would like to ensure
+ * that each of a possibly large number of virtual machines is using RAM
+ * efficiently, i.e. using as little as possible when under light load
+ * and obtaining as much as possible when memory demands are high.
+ * Since RAM needs vary highly dynamically and sometimes dramatically,
+ * "hysteresis" is used, that is, the memory target is determined not just
+ * on current data but also on past data stored in the system.
+ *
+ * "Selfballooning" creates memory pressure by managing the Xen balloon
+ * driver to decrease and increase available kernel memory, driven
+ * largely by the target value of "Committed_AS" (see /proc/meminfo).
+ * Since Committed_AS does not account for clean mapped pages (i.e. pages
+ * in RAM that are identical to pages on disk), selfballooning has the
+ * effect of pushing less frequently used clean pagecache pages out of
+ * kernel RAM and, presumably using cleancache, into Xen tmem where
+ * Xen can more efficiently optimize RAM utilization for such pages.
+ *
+ * When kernel memory demand unexpectedly increases faster than Xen, via
+ * the selfballoon driver, is able to (or chooses to) provide usable RAM,
+ * the kernel may invoke swapping. In most cases, frontswap is able
+ * to absorb this swapping into Xen tmem.
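
The hysteresis policy described above is the update rule selfballoon_process() applies further down in this file: per interval, the balloon target moves toward the goal by only 1/selfballoon_downhysteresis of the gap when shrinking, but by the full gap when growing (downhysteresis defaults to 8, uphysteresis to 1). A stand-alone illustration of one step, with invented page counts:

	#include <stdio.h>

	/* One hysteresis step, mirroring the computation in
	 * selfballoon_process(); all numbers below are made up. */
	static unsigned long step(unsigned long cur, unsigned long goal,
				  unsigned int downhys, unsigned int uphys)
	{
		if (cur > goal)		/* release 1/downhys of the surplus */
			return cur - (cur - goal) / downhys;
		if (cur < goal)		/* reclaim the deficit / uphys */
			return cur + (goal - cur) / uphys;
		return cur;
	}

	int main(void)
	{
		/* Defaults from this driver: down = 8, up = 1. */
		printf("%lu\n", step(100000, 92000, 8, 1));  /* 99000  */
		printf("%lu\n", step(100000, 108000, 8, 1)); /* 108000 */
		return 0;
	}

A surplus therefore drains at one eighth per interval while a shortfall is handed back in a single step, which is the asymmetry the comment calls "hysteresis".
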
However, due to the fact + * that the kernel swap subsystem assumes swapping occurs to a disk, + * swapped pages may sit on the disk for a very long time; even if + * the kernel knows the page will never be used again. This is because + * the disk space costs very little and can be overwritten when + * necessary. When such stale pages are in frontswap, however, they + * are taking up valuable real estate. "Frontswap selfshrinking" works + * to resolve this: When frontswap activity is otherwise stable + * and the guest kernel is not under memory pressure, the "frontswap + * selfshrinking" accounts for this by providing pressure to remove some + * pages from frontswap and return them to kernel memory. + * + * For both "selfballooning" and "frontswap-selfshrinking", a worker + * thread is used and sysfs tunables are provided to adjust the frequency + * and rate of adjustments to achieve the goal, as well as to disable one + * or both functions independently. + * + * While some argue that this functionality can and should be implemented + * in userspace, it has been observed that bad things happen (e.g. OOMs). + * + * System configuration note: Selfballooning should not be enabled on + * systems without a sufficiently large swap device configured; for best + * results, it is recommended that total swap be increased by the size + * of the guest memory. Also, while technically not required to be + * configured, it is highly recommended that frontswap also be configured + * and enabled when selfballooning is running. So, selfballooning + * is disabled by default if frontswap is not configured and can only + * be enabled with the "selfballooning" kernel boot option; similarly + * selfballooning is enabled by default if frontswap is configured and + * can be disabled with the "noselfballooning" kernel boot option. Finally, + * when frontswap is configured, frontswap-selfshrinking can be disabled + * with the "noselfshrink" kernel boot option. + * + * Selfballooning is disallowed in domain0 and force-disabled. + * + */ + +#include +#include +#include + +#include + +#include + +/* Enable/disable with sysfs. */ +static int xen_selfballooning_enabled __read_mostly; + +/* + * Controls rate at which memory target (this iteration) approaches + * ultimate goal when memory need is increasing (up-hysteresis) or + * decreasing (down-hysteresis). Higher values of hysteresis cause + * slower increases/decreases. The default values for the various + * parameters were deemed reasonable by experimentation, may be + * workload-dependent, and can all be adjusted via sysfs. + */ +static unsigned int selfballoon_downhysteresis __read_mostly = 8; +static unsigned int selfballoon_uphysteresis __read_mostly = 1; + +/* In HZ, controls frequency of worker invocation. */ +static unsigned int selfballoon_interval __read_mostly = 5; + +static void selfballoon_process(struct work_struct *work); +static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); + +#ifdef CONFIG_FRONTSWAP +#include + +/* Enable/disable with sysfs. */ +static bool frontswap_selfshrinking __read_mostly; + +/* Enable/disable with kernel boot option. */ +static bool use_frontswap_selfshrink __initdata = true; + +/* + * The default values for the following parameters were deemed reasonable + * by experimentation, may be workload-dependent, and can all be + * adjusted via sysfs. + */ + +/* Control rate for frontswap shrinking. Higher hysteresis is slower. 
*/ +static unsigned int frontswap_hysteresis __read_mostly = 20; + +/* + * Number of selfballoon worker invocations to wait before observing that + * frontswap selfshrinking should commence. Note that selfshrinking does + * not use a separate worker thread. + */ +static unsigned int frontswap_inertia __read_mostly = 3; + +/* Countdown to next invocation of frontswap_shrink() */ +static unsigned long frontswap_inertia_counter; + +/* + * Invoked by the selfballoon worker thread, uses current number of pages + * in frontswap (frontswap_curr_pages()), previous status, and control + * values (hysteresis and inertia) to determine if frontswap should be + * shrunk and what the new frontswap size should be. Note that + * frontswap_shrink is essentially a partial swapoff that immediately + * transfers pages from the "swap device" (frontswap) back into kernel + * RAM; despite the name, frontswap "shrinking" is very different from + * the "shrinker" interface used by the kernel MM subsystem to reclaim + * memory. + */ +static void frontswap_selfshrink(void) +{ + static unsigned long cur_frontswap_pages; + static unsigned long last_frontswap_pages; + static unsigned long tgt_frontswap_pages; + + last_frontswap_pages = cur_frontswap_pages; + cur_frontswap_pages = frontswap_curr_pages(); + if (!cur_frontswap_pages || + (cur_frontswap_pages > last_frontswap_pages)) { + frontswap_inertia_counter = frontswap_inertia; + return; + } + if (frontswap_inertia_counter && --frontswap_inertia_counter) + return; + if (cur_frontswap_pages <= frontswap_hysteresis) + tgt_frontswap_pages = 0; + else + tgt_frontswap_pages = cur_frontswap_pages - + (cur_frontswap_pages / frontswap_hysteresis); + frontswap_shrink(tgt_frontswap_pages); +} + +static int __init xen_nofrontswap_selfshrink_setup(char *s) +{ + use_frontswap_selfshrink = false; + return 1; +} + +__setup("noselfshrink", xen_nofrontswap_selfshrink_setup); + +/* Disable with kernel boot option. */ +static bool use_selfballooning __initdata = true; + +static int __init xen_noselfballooning_setup(char *s) +{ + use_selfballooning = false; + return 1; +} + +__setup("noselfballooning", xen_noselfballooning_setup); +#else /* !CONFIG_FRONTSWAP */ +/* Enable with kernel boot option. 
*/ +static bool use_selfballooning __initdata = false; + +static int __init xen_selfballooning_setup(char *s) +{ + use_selfballooning = true; + return 1; +} + +__setup("selfballooning", xen_selfballooning_setup); +#endif /* CONFIG_FRONTSWAP */ + +/* + * Use current balloon size, the goal (vm_committed_as), and hysteresis + * parameters to set a new target balloon size + */ +static void selfballoon_process(struct work_struct *work) +{ + unsigned long cur_pages, goal_pages, tgt_pages; + bool reset_timer = false; + + if (xen_selfballooning_enabled) { + cur_pages = balloon_stats.current_pages; + tgt_pages = cur_pages; /* default is no change */ + goal_pages = percpu_counter_read_positive(&vm_committed_as) + + balloon_stats.current_pages - totalram_pages; +#ifdef CONFIG_FRONTSWAP + /* allow space for frontswap pages to be repatriated */ + if (frontswap_selfshrinking && frontswap_enabled) + goal_pages += frontswap_curr_pages(); +#endif + if (cur_pages > goal_pages) + tgt_pages = cur_pages - + ((cur_pages - goal_pages) / + selfballoon_downhysteresis); + else if (cur_pages < goal_pages) + tgt_pages = cur_pages + + ((goal_pages - cur_pages) / + selfballoon_uphysteresis); + /* else if cur_pages == goal_pages, no change */ + balloon_set_new_target(tgt_pages); + reset_timer = true; + } +#ifdef CONFIG_FRONTSWAP + if (frontswap_selfshrinking && frontswap_enabled) { + frontswap_selfshrink(); + reset_timer = true; + } +#endif + if (reset_timer) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); +} + +#ifdef CONFIG_SYSFS + +#include +#include + +#define SELFBALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } + +SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); + +static ssize_t store_selfballooning(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + bool was_enabled = xen_selfballooning_enabled; + unsigned long tmp; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = strict_strtoul(buf, 10, &tmp); + if (err || ((tmp != 0) && (tmp != 1))) + return -EINVAL; + + xen_selfballooning_enabled = !!tmp; + if (!was_enabled && xen_selfballooning_enabled) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); + + return count; +} + +static SYSDEV_ATTR(selfballooning, S_IRUGO | S_IWUSR, + show_selfballooning, store_selfballooning); + +SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); + +static ssize_t store_selfballoon_interval(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + selfballoon_interval = val; + return count; +} + +static SYSDEV_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, + show_selfballoon_interval, store_selfballoon_interval); + +SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); + +static ssize_t store_selfballoon_downhys(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + selfballoon_downhysteresis = val; + return count; +} + +static SYSDEV_ATTR(selfballoon_downhysteresis, S_IRUGO | 
S_IWUSR, + show_selfballoon_downhys, store_selfballoon_downhys); + + +SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); + +static ssize_t store_selfballoon_uphys(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + selfballoon_uphysteresis = val; + return count; +} + +static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, + show_selfballoon_uphys, store_selfballoon_uphys); + +#ifdef CONFIG_FRONTSWAP +SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); + +static ssize_t store_frontswap_selfshrinking(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + bool was_enabled = frontswap_selfshrinking; + unsigned long tmp; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &tmp); + if (err || ((tmp != 0) && (tmp != 1))) + return -EINVAL; + frontswap_selfshrinking = !!tmp; + if (!was_enabled && !xen_selfballooning_enabled && + frontswap_selfshrinking) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); + + return count; +} + +static SYSDEV_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, + show_frontswap_selfshrinking, store_frontswap_selfshrinking); + +SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); + +static ssize_t store_frontswap_inertia(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + frontswap_inertia = val; + frontswap_inertia_counter = val; + return count; +} + +static SYSDEV_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, + show_frontswap_inertia, store_frontswap_inertia); + +SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); + +static ssize_t store_frontswap_hysteresis(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + frontswap_hysteresis = val; + return count; +} + +static SYSDEV_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, + show_frontswap_hysteresis, store_frontswap_hysteresis); + +#endif /* CONFIG_FRONTSWAP */ + +static struct attribute *selfballoon_attrs[] = { + &attr_selfballooning.attr, + &attr_selfballoon_interval.attr, + &attr_selfballoon_downhysteresis.attr, + &attr_selfballoon_uphysteresis.attr, +#ifdef CONFIG_FRONTSWAP + &attr_frontswap_selfshrinking.attr, + &attr_frontswap_hysteresis.attr, + &attr_frontswap_inertia.attr, +#endif + NULL +}; + +static struct attribute_group selfballoon_group = { + .name = "selfballoon", + .attrs = selfballoon_attrs +}; +#endif + +int register_xen_selfballooning(struct sys_device *sysdev) +{ + int error = -1; + +#ifdef CONFIG_SYSFS + error = sysfs_create_group(&sysdev->kobj, &selfballoon_group); +#endif + return error; +} +EXPORT_SYMBOL(register_xen_selfballooning); + +static int __init xen_selfballoon_init(void) +{ + bool enable = false; + + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain()) { + pr_info("xen/balloon: Xen selfballooning driver " + "disabled for domain0.\n"); + return -ENODEV; + } + + xen_selfballooning_enabled 
= tmem_enabled && use_selfballooning; + if (xen_selfballooning_enabled) { + pr_info("xen/balloon: Initializing Xen " + "selfballooning driver.\n"); + enable = true; + } +#ifdef CONFIG_FRONTSWAP + frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; + if (frontswap_selfshrinking) { + pr_info("xen/balloon: Initializing frontswap " + "selfshrinking driver.\n"); + enable = true; + } +#endif + if (!enable) + return -ENODEV; + + schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); + + return 0; +} + +subsys_initcall(xen_selfballoon_init); + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 739769551e33..bd2f90c9ac8b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -378,26 +378,32 @@ static void xenbus_dev_release(struct device *dev) kfree(to_xenbus_device(dev)); } -static ssize_t xendev_show_nodename(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodename_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } -static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); -static ssize_t xendev_show_devtype(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t devtype_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } -static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); -static ssize_t xendev_show_modalias(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t modalias_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype); + return sprintf(buf, "%s:%s\n", dev->bus->name, + to_xenbus_device(dev)->devicetype); } -static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); + +struct device_attribute xenbus_dev_attrs[] = { + __ATTR_RO(nodename), + __ATTR_RO(devtype), + __ATTR_RO(modalias), + __ATTR_NULL +}; +EXPORT_SYMBOL_GPL(xenbus_dev_attrs); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -449,25 +455,7 @@ int xenbus_probe_node(struct xen_bus_type *bus, if (err) goto fail; - err = device_create_file(&xendev->dev, &dev_attr_nodename); - if (err) - goto fail_unregister; - - err = device_create_file(&xendev->dev, &dev_attr_devtype); - if (err) - goto fail_remove_nodename; - - err = device_create_file(&xendev->dev, &dev_attr_modalias); - if (err) - goto fail_remove_devtype; - return 0; -fail_remove_devtype: - device_remove_file(&xendev->dev, &dev_attr_devtype); -fail_remove_nodename: - device_remove_file(&xendev->dev, &dev_attr_nodename); -fail_unregister: - device_unregister(&xendev->dev); fail: kfree(xendev); return err; diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 888b9900ca08..b814935378c7 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -48,6 +48,8 @@ struct xen_bus_type struct bus_type bus; }; +extern struct device_attribute xenbus_dev_attrs[]; + extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); extern int xenbus_dev_remove(struct device *_dev); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 
6cf467bf63ec..60adf919d78d 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -107,6 +107,9 @@ static int xenbus_uevent_backend(struct device *dev, if (xdev == NULL) return -ENODEV; + if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype)) + return -ENOMEM; + /* stuff we want to pass to /sbin/hotplug */ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype)) return -ENOMEM; @@ -183,10 +186,6 @@ static void frontend_changed(struct xenbus_watch *watch, xenbus_otherend_changed(watch, vec, len, 0); } -static struct device_attribute xenbus_backend_dev_attrs[] = { - __ATTR_NULL -}; - static struct xen_bus_type xenbus_backend = { .root = "backend", .levels = 3, /* backend/type// */ @@ -200,7 +199,7 @@ static struct xen_bus_type xenbus_backend = { .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_backend_dev_attrs, + .dev_attrs = xenbus_dev_attrs, }, }; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index b6a2690c9d49..ed2ba474a560 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -81,10 +81,6 @@ static void backend_changed(struct xenbus_watch *watch, xenbus_otherend_changed(watch, vec, len, 1); } -static struct device_attribute xenbus_frontend_dev_attrs[] = { - __ATTR_NULL -}; - static const struct dev_pm_ops xenbus_pm_ops = { .suspend = xenbus_dev_suspend, .resume = xenbus_dev_resume, @@ -106,7 +102,7 @@ static struct xen_bus_type xenbus_frontend = { .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_frontend_dev_attrs, + .dev_attrs = xenbus_dev_attrs, .pm = &xenbus_pm_ops, }, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bb71471a4d9d..a9b4a24f2a16 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1737,7 +1737,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, io_parms.pid = pid; io_parms.tcon = pTcon; io_parms.offset = *poffset; - io_parms.length = len; + io_parms.length = cur_len; rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &read_data, &buf_type); pSMBr = (struct smb_com_read_rsp *)read_data; diff --git a/fs/dcache.c b/fs/dcache.c index 6e4ea6d87774..fbdcbca40725 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1813,8 +1813,6 @@ seqretry: tname = dentry->d_name.name; i = dentry->d_inode; prefetch(tname); - if (i) - prefetch(i); /* * This seqcount check is required to ensure name and * len are loaded atomically, so as not to walk off the diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index abc49f292454..90e5997262ea 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -14,17 +14,9 @@ #include "dlm_internal.h" #include "lock.h" #include "user.h" -#include "ast.h" - -#define WAKE_ASTS 0 - -static uint64_t ast_seq_count; -static struct list_head ast_queue; -static spinlock_t ast_queue_lock; -static struct task_struct * astd_task; -static unsigned long astd_wakeflags; -static struct mutex astd_running; +static uint64_t dlm_cb_seq; +static spinlock_t dlm_cb_seq_spin; static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) { @@ -57,21 +49,13 @@ static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) } } -void dlm_del_ast(struct dlm_lkb *lkb) -{ - spin_lock(&ast_queue_lock); - if (!list_empty(&lkb->lkb_astqueue)) - list_del_init(&lkb->lkb_astqueue); - spin_unlock(&ast_queue_lock); -} - int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags, 
uint64_t seq) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; uint64_t prev_seq; int prev_mode; - int i; + int i, rv; for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { if (lkb->lkb_callbacks[i].seq) @@ -100,7 +84,8 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, mode, (unsigned long long)prev_seq, prev_mode); - return 0; + rv = 0; + goto out; } } @@ -109,6 +94,7 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, lkb->lkb_callbacks[i].mode = mode; lkb->lkb_callbacks[i].sb_status = status; lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); + rv = 0; break; } @@ -117,21 +103,24 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, lkb->lkb_id, (unsigned long long)seq, flags, mode, status, sbflags); dlm_dump_lkb_callbacks(lkb); - return -1; + rv = -1; + goto out; } - - return 0; + out: + return rv; } int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_callback *cb, int *resid) { - int i; + int i, rv; *resid = 0; - if (!lkb->lkb_callbacks[0].seq) - return -ENOENT; + if (!lkb->lkb_callbacks[0].seq) { + rv = -ENOENT; + goto out; + } /* oldest undelivered cb is callbacks[0] */ @@ -163,7 +152,8 @@ int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, cb->mode, (unsigned long long)lkb->lkb_last_cast.seq, lkb->lkb_last_cast.mode); - return 0; + rv = 0; + goto out; } } @@ -176,171 +166,150 @@ int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); lkb->lkb_last_bast_time = ktime_get(); } - - return 0; + rv = 0; + out: + return rv; } -void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, - uint32_t sbflags) +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags) { - uint64_t seq; + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + uint64_t new_seq, prev_seq; int rv; - spin_lock(&ast_queue_lock); - - seq = ++ast_seq_count; + spin_lock(&dlm_cb_seq_spin); + new_seq = ++dlm_cb_seq; + spin_unlock(&dlm_cb_seq_spin); if (lkb->lkb_flags & DLM_IFL_USER) { - spin_unlock(&ast_queue_lock); - dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq); + dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq); return; } - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); - if (rv < 0) { - spin_unlock(&ast_queue_lock); - return; - } + mutex_lock(&lkb->lkb_cb_mutex); + prev_seq = lkb->lkb_callbacks[0].seq; - if (list_empty(&lkb->lkb_astqueue)) { + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); + if (rv < 0) + goto out; + + if (!prev_seq) { kref_get(&lkb->lkb_ref); - list_add_tail(&lkb->lkb_astqueue, &ast_queue); - } - spin_unlock(&ast_queue_lock); - set_bit(WAKE_ASTS, &astd_wakeflags); - wake_up_process(astd_task); + if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { + mutex_lock(&ls->ls_cb_mutex); + list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); + mutex_unlock(&ls->ls_cb_mutex); + } else { + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + } + } + out: + mutex_unlock(&lkb->lkb_cb_mutex); } -static void process_asts(void) +void dlm_callback_work(struct work_struct *work) { - struct dlm_ls *ls = NULL; - struct dlm_rsb *r = NULL; - struct dlm_lkb *lkb; + struct dlm_lkb *lkb = container_of(work, struct dlm_lkb, lkb_cb_work); + struct dlm_ls *ls = lkb->lkb_resource->res_ls; void (*castfn) (void *astparam); void (*bastfn) (void *astparam, int mode); struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; int i, rv, resid; -repeat: - 
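The dlm changes running through these hunks retire the global ast_queue and the dlm_astd kernel thread: each lkb now carries a small fixed array of pending callbacks (lkb_callbacks[], DLM_CALLBACKS_SIZE slots) plus its own work item, and dlm_callback_work() drains the slots in order on a per-lockspace workqueue. The sketch below is a minimal userspace model of just the slot discipline, with the oldest pending callback always in slot 0 and a residual count telling the caller whether more remain; the names (struct lk, CB_SLOTS, add_cb, rem_cb) are illustrative, not the kernel's, and the real code additionally tracks sequence numbers, skip flags and the lkb_cb_mutex.

	#include <stdio.h>
	#include <string.h>

	#define CB_SLOTS 6			/* cf. DLM_CALLBACKS_SIZE */

	struct cb { unsigned long long seq; int mode; };
	struct lk { struct cb cbs[CB_SLOTS]; };

	/* store a callback in the first free slot; -1 if all are busy */
	static int add_cb(struct lk *lk, unsigned long long seq, int mode)
	{
		int i;

		for (i = 0; i < CB_SLOTS; i++) {
			if (lk->cbs[i].seq)
				continue;
			lk->cbs[i].seq = seq;
			lk->cbs[i].mode = mode;
			return 0;
		}
		return -1;
	}

	/* pop the oldest callback; *resid = callbacks still pending */
	static int rem_cb(struct lk *lk, struct cb *out, int *resid)
	{
		int i;

		if (!lk->cbs[0].seq)
			return -1;		/* nothing queued */
		*out = lk->cbs[0];
		memmove(&lk->cbs[0], &lk->cbs[1],
			sizeof(struct cb) * (CB_SLOTS - 1));
		memset(&lk->cbs[CB_SLOTS - 1], 0, sizeof(struct cb));
		for (i = 0, *resid = 0; i < CB_SLOTS; i++)
			if (lk->cbs[i].seq)
				(*resid)++;
		return 0;
	}

	int main(void)
	{
		struct lk lk = {0};
		struct cb cb;
		int resid;

		add_cb(&lk, 1, 3);
		add_cb(&lk, 2, 5);
		while (!rem_cb(&lk, &cb, &resid))
			printf("seq %llu mode %d, %d left\n",
			       cb.seq, cb.mode, resid);
		return 0;
	}

As in dlm_add_cb() above, a producer only needs to queue the work item when slot 0 was empty beforehand, so a burst of callbacks costs a single enqueue.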
spin_lock(&ast_queue_lock); - list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { - r = lkb->lkb_resource; - ls = r->res_ls; + memset(&callbacks, 0, sizeof(callbacks)); - if (dlm_locking_stopped(ls)) - continue; - - /* we remove from astqueue list and remove everything in - lkb_callbacks before releasing the spinlock so empty - lkb_astqueue is always consistent with empty lkb_callbacks */ - - list_del_init(&lkb->lkb_astqueue); - - castfn = lkb->lkb_astfn; - bastfn = lkb->lkb_bastfn; + mutex_lock(&lkb->lkb_cb_mutex); + if (!lkb->lkb_callbacks[0].seq) { + /* no callback work exists, shouldn't happen */ + log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); + dlm_print_lkb(lkb); + dlm_dump_lkb_callbacks(lkb); + } - memset(&callbacks, 0, sizeof(callbacks)); + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); + if (rv < 0) + break; + } - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); - if (rv < 0) - break; - } - spin_unlock(&ast_queue_lock); + if (resid) { + /* cbs remain, loop should have removed all, shouldn't happen */ + log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id, + resid); + dlm_print_lkb(lkb); + dlm_dump_lkb_callbacks(lkb); + } + mutex_unlock(&lkb->lkb_cb_mutex); - if (resid) { - /* shouldn't happen, for loop should have removed all */ - log_error(ls, "callback resid %d lkb %x", - resid, lkb->lkb_id); - } + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (!callbacks[i].seq) - break; - if (callbacks[i].flags & DLM_CB_SKIP) { - continue; - } else if (callbacks[i].flags & DLM_CB_BAST) { - bastfn(lkb->lkb_astparam, callbacks[i].mode); - } else if (callbacks[i].flags & DLM_CB_CAST) { - lkb->lkb_lksb->sb_status = callbacks[i].sb_status; - lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; - castfn(lkb->lkb_astparam); - } + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (!callbacks[i].seq) + break; + if (callbacks[i].flags & DLM_CB_SKIP) { + continue; + } else if (callbacks[i].flags & DLM_CB_BAST) { + bastfn(lkb->lkb_astparam, callbacks[i].mode); + } else if (callbacks[i].flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = callbacks[i].sb_status; + lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + castfn(lkb->lkb_astparam); } - - /* removes ref for ast_queue, may cause lkb to be freed */ - dlm_put_lkb(lkb); - - cond_resched(); - goto repeat; } - spin_unlock(&ast_queue_lock); -} - -static inline int no_asts(void) -{ - int ret; - spin_lock(&ast_queue_lock); - ret = list_empty(&ast_queue); - spin_unlock(&ast_queue_lock); - return ret; + /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ + dlm_put_lkb(lkb); } -static int dlm_astd(void *data) +int dlm_callback_start(struct dlm_ls *ls) { - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (!test_bit(WAKE_ASTS, &astd_wakeflags)) - schedule(); - set_current_state(TASK_RUNNING); - - mutex_lock(&astd_running); - if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) - process_asts(); - mutex_unlock(&astd_running); + ls->ls_callback_wq = alloc_workqueue("dlm_callback", + WQ_UNBOUND | + WQ_MEM_RECLAIM | + WQ_NON_REENTRANT, + 0); + if (!ls->ls_callback_wq) { + log_print("can't start dlm_callback workqueue"); + return -ENOMEM; } return 0; } -void dlm_astd_wake(void) +void dlm_callback_stop(struct dlm_ls *ls) { - if (!no_asts()) { - set_bit(WAKE_ASTS, &astd_wakeflags); - wake_up_process(astd_task); - } + if (ls->ls_callback_wq) + 
destroy_workqueue(ls->ls_callback_wq); } -int dlm_astd_start(void) +void dlm_callback_suspend(struct dlm_ls *ls) { - struct task_struct *p; - int error = 0; - - INIT_LIST_HEAD(&ast_queue); - spin_lock_init(&ast_queue_lock); - mutex_init(&astd_running); - - p = kthread_run(dlm_astd, NULL, "dlm_astd"); - if (IS_ERR(p)) - error = PTR_ERR(p); - else - astd_task = p; - return error; -} + set_bit(LSFL_CB_DELAY, &ls->ls_flags); -void dlm_astd_stop(void) -{ - kthread_stop(astd_task); + if (ls->ls_callback_wq) + flush_workqueue(ls->ls_callback_wq); } -void dlm_astd_suspend(void) +void dlm_callback_resume(struct dlm_ls *ls) { - mutex_lock(&astd_running); -} + struct dlm_lkb *lkb, *safe; + int count = 0; -void dlm_astd_resume(void) -{ - mutex_unlock(&astd_running); + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + + if (!ls->ls_callback_wq) + return; + + mutex_lock(&ls->ls_cb_mutex); + list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { + list_del_init(&lkb->lkb_cb_list); + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + count++; + } + mutex_unlock(&ls->ls_cb_mutex); + + log_debug(ls, "dlm_callback_resume %d", count); } diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 8aa89c9b5611..757b551c6820 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -18,14 +18,15 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags, uint64_t seq); int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_callback *cb, int *resid); -void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, - uint32_t sbflags); +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags); -void dlm_astd_wake(void); -int dlm_astd_start(void); -void dlm_astd_stop(void); -void dlm_astd_suspend(void); -void dlm_astd_resume(void); +void dlm_callback_work(struct work_struct *work); +int dlm_callback_start(struct dlm_ls *ls); +void dlm_callback_stop(struct dlm_ls *ls); +void dlm_callback_suspend(struct dlm_ls *ls); +void dlm_callback_resume(struct dlm_ls *ls); #endif + diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 9b026ea8baa9..6cf72fcc0d0c 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -28,7 +28,8 @@ * /config/dlm//spaces//nodes//weight * /config/dlm//comms//nodeid * /config/dlm//comms//local - * /config/dlm//comms//addr + * /config/dlm//comms//addr (write only) + * /config/dlm//comms//addr_list (read only) * The level is useless, but I haven't figured out how to avoid it. 
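 *
+ * For example, with a cluster directory "mycluster" and a comm entry
+ * "1" (names hypothetical; these directories are created from
+ * userspace, typically by dlm_controld), the split introduced here
+ * looks like:
+ *
+ *   /config/dlm/mycluster/comms/1/addr       - write only; takes a
+ *                                              raw struct sockaddr_storage
+ *   /config/dlm/mycluster/comms/1/addr_list  - read only; prints lines
+ *                                              such as "AF_INET 10.0.0.1"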
*/ @@ -80,6 +81,7 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, size_t len); static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len); +static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf); static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len); @@ -92,7 +94,6 @@ struct dlm_cluster { unsigned int cl_tcp_port; unsigned int cl_buffer_size; unsigned int cl_rsbtbl_size; - unsigned int cl_lkbtbl_size; unsigned int cl_dirtbl_size; unsigned int cl_recover_timer; unsigned int cl_toss_secs; @@ -101,13 +102,13 @@ struct dlm_cluster { unsigned int cl_protocol; unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; + unsigned int cl_new_rsb_count; }; enum { CLUSTER_ATTR_TCP_PORT = 0, CLUSTER_ATTR_BUFFER_SIZE, CLUSTER_ATTR_RSBTBL_SIZE, - CLUSTER_ATTR_LKBTBL_SIZE, CLUSTER_ATTR_DIRTBL_SIZE, CLUSTER_ATTR_RECOVER_TIMER, CLUSTER_ATTR_TOSS_SECS, @@ -116,6 +117,7 @@ enum { CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, + CLUSTER_ATTR_NEW_RSB_COUNT, }; struct cluster_attribute { @@ -160,7 +162,6 @@ __CONFIGFS_ATTR(name, 0644, name##_read, name##_write) CLUSTER_ATTR(tcp_port, 1); CLUSTER_ATTR(buffer_size, 1); CLUSTER_ATTR(rsbtbl_size, 1); -CLUSTER_ATTR(lkbtbl_size, 1); CLUSTER_ATTR(dirtbl_size, 1); CLUSTER_ATTR(recover_timer, 1); CLUSTER_ATTR(toss_secs, 1); @@ -169,12 +170,12 @@ CLUSTER_ATTR(log_debug, 0); CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); +CLUSTER_ATTR(new_rsb_count, 0); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, - [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr, [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr, [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, @@ -183,6 +184,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, + [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, NULL, }; @@ -190,6 +192,7 @@ enum { COMM_ATTR_NODEID = 0, COMM_ATTR_LOCAL, COMM_ATTR_ADDR, + COMM_ATTR_ADDR_LIST, }; struct comm_attribute { @@ -217,14 +220,22 @@ static struct comm_attribute comm_attr_local = { static struct comm_attribute comm_attr_addr = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "addr", - .ca_mode = S_IRUGO | S_IWUSR }, + .ca_mode = S_IWUSR }, .store = comm_addr_write, }; +static struct comm_attribute comm_attr_addr_list = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "addr_list", + .ca_mode = S_IRUGO }, + .show = comm_addr_list_read, +}; + static struct configfs_attribute *comm_attrs[] = { [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, [COMM_ATTR_LOCAL] = &comm_attr_local.attr, [COMM_ATTR_ADDR] = &comm_attr_addr.attr, + [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr, NULL, }; @@ -435,7 +446,6 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_tcp_port = dlm_config.ci_tcp_port; cl->cl_buffer_size = dlm_config.ci_buffer_size; cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; - cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size; cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size; 
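comm_addr_list_read(), added a little further below, renders each stored sockaddr into a small scratch buffer and appends it to the output page, while an "allowance" counter guarantees the 4096-byte configfs page is never overrun. A rough userspace analogue, with inet_ntop() standing in for the kernel's %pI4/%pI6 format extensions (function and variable names here are illustrative):

	#include <arpa/inet.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>

	static ssize_t addr_list_read(const struct sockaddr_storage *addrs,
				      int count, char *buf)
	{
		ssize_t allowance = 4096;	/* configfs page size */
		char line[64], ip[INET6_ADDRSTRLEN];
		int i, n;

		buf[0] = '\0';
		for (i = 0; i < count; i++) {
			const struct sockaddr_storage *ss = &addrs[i];

			if (ss->ss_family == AF_INET) {
				const struct sockaddr_in *in4 =
					(const struct sockaddr_in *)ss;
				inet_ntop(AF_INET, &in4->sin_addr,
					  ip, sizeof(ip));
				n = snprintf(line, sizeof(line),
					     "AF_INET %s\n", ip);
			} else if (ss->ss_family == AF_INET6) {
				const struct sockaddr_in6 *in6 =
					(const struct sockaddr_in6 *)ss;
				inet_ntop(AF_INET6, &in6->sin6_addr,
					  ip, sizeof(ip));
				n = snprintf(line, sizeof(line),
					     "AF_INET6 %s\n", ip);
			} else {
				n = snprintf(line, sizeof(line), "\n");
			}
			if (n > allowance)
				break;		/* page full, stop early */
			allowance -= n;
			strcat(buf, line);
		}
		return 4096 - allowance;
	}

	int main(void)
	{
		struct sockaddr_storage ss = {0};
		struct sockaddr_in *in4 = (struct sockaddr_in *)&ss;
		char page[4096];

		in4->sin_family = AF_INET;
		inet_pton(AF_INET, "10.0.0.1", &in4->sin_addr);
		printf("%zd bytes: %s", addr_list_read(&ss, 1, page), page);
		return 0;
	}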
cl->cl_recover_timer = dlm_config.ci_recover_timer; cl->cl_toss_secs = dlm_config.ci_toss_secs; @@ -444,6 +454,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_protocol = dlm_config.ci_protocol; cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; + cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; space_list = &sps->ss_group; comm_list = &cms->cs_group; @@ -720,6 +731,50 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) return len; } +static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf) +{ + ssize_t s; + ssize_t allowance; + int i; + struct sockaddr_storage *addr; + struct sockaddr_in *addr_in; + struct sockaddr_in6 *addr_in6; + + /* Taken from ip6_addr_string() defined in lib/vsprintf.c */ + char buf0[sizeof("AF_INET6 xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")]; + + + /* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */ + allowance = 4096; + buf[0] = '\0'; + + for (i = 0; i < cm->addr_count; i++) { + addr = cm->addr[i]; + + switch(addr->ss_family) { + case AF_INET: + addr_in = (struct sockaddr_in *)addr; + s = sprintf(buf0, "AF_INET %pI4\n", &addr_in->sin_addr.s_addr); + break; + case AF_INET6: + addr_in6 = (struct sockaddr_in6 *)addr; + s = sprintf(buf0, "AF_INET6 %pI6\n", &addr_in6->sin6_addr); + break; + default: + s = sprintf(buf0, "%s\n", ""); + break; + } + allowance -= s; + if (allowance >= 0) + strcat(buf, buf0); + else { + allowance += s; + break; + } + } + return 4096 - allowance; +} + static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, char *buf) { @@ -983,7 +1038,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_TCP_PORT 21064 #define DEFAULT_BUFFER_SIZE 4096 #define DEFAULT_RSBTBL_SIZE 1024 -#define DEFAULT_LKBTBL_SIZE 1024 #define DEFAULT_DIRTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 @@ -992,12 +1046,12 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_PROTOCOL 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 +#define DEFAULT_NEW_RSB_COUNT 128 struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, .ci_buffer_size = DEFAULT_BUFFER_SIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, - .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE, .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE, .ci_recover_timer = DEFAULT_RECOVER_TIMER, .ci_toss_secs = DEFAULT_TOSS_SECS, @@ -1005,6 +1059,7 @@ struct dlm_config_info dlm_config = { .ci_log_debug = DEFAULT_LOG_DEBUG, .ci_protocol = DEFAULT_PROTOCOL, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, - .ci_waitwarn_us = DEFAULT_WAITWARN_US + .ci_waitwarn_us = DEFAULT_WAITWARN_US, + .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index dd0ce24d5a80..3099d0dd26c0 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -20,7 +20,6 @@ struct dlm_config_info { int ci_tcp_port; int ci_buffer_size; int ci_rsbtbl_size; - int ci_lkbtbl_size; int ci_dirtbl_size; int ci_recover_timer; int ci_toss_secs; @@ -29,6 +28,7 @@ struct dlm_config_info { int ci_protocol; int ci_timewarn_cs; int ci_waitwarn_us; + int ci_new_rsb_count; }; extern struct dlm_config_info dlm_config; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 0262451eb9c6..fe2860c02449 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,6 @@ struct dlm_ls; struct dlm_lkb; struct 
dlm_rsb; struct dlm_member; -struct dlm_lkbtable; struct dlm_rsbtable; struct dlm_dirtable; struct dlm_direntry; @@ -108,11 +108,6 @@ struct dlm_rsbtable { spinlock_t lock; }; -struct dlm_lkbtable { - struct list_head list; - rwlock_t lock; - uint16_t counter; -}; /* * Lockspace member (per node in a ls) @@ -248,17 +243,18 @@ struct dlm_lkb { int8_t lkb_wait_count; int lkb_wait_nodeid; /* for debugging */ - struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ struct list_head lkb_statequeue; /* rsb g/c/w list */ struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ - struct list_head lkb_astqueue; /* need ast to be sent */ struct list_head lkb_ownqueue; /* list of locks for a process */ struct list_head lkb_time_list; ktime_t lkb_timestamp; ktime_t lkb_wait_time; unsigned long lkb_timeout_cs; + struct mutex lkb_cb_mutex; + struct work_struct lkb_cb_work; + struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; struct dlm_callback lkb_last_cast; struct dlm_callback lkb_last_bast; @@ -299,7 +295,7 @@ struct dlm_rsb { int res_recover_locks_count; char *res_lvbptr; - char res_name[1]; + char res_name[DLM_RESNAME_MAXLEN+1]; }; /* find_rsb() flags */ @@ -465,12 +461,12 @@ struct dlm_ls { unsigned long ls_scan_time; struct kobject ls_kobj; + struct idr ls_lkbidr; + spinlock_t ls_lkbidr_spin; + struct dlm_rsbtable *ls_rsbtbl; uint32_t ls_rsbtbl_size; - struct dlm_lkbtable *ls_lkbtbl; - uint32_t ls_lkbtbl_size; - struct dlm_dirtable *ls_dirtbl; uint32_t ls_dirtbl_size; @@ -483,6 +479,10 @@ struct dlm_ls { struct mutex ls_timeout_mutex; struct list_head ls_timeout; + spinlock_t ls_new_rsb_spin; + int ls_new_rsb_count; + struct list_head ls_new_rsb; /* new rsb structs */ + struct list_head ls_nodes; /* current nodes in ls */ struct list_head ls_nodes_gone; /* dead node list, recovery */ int ls_num_nodes; /* number of nodes in ls */ @@ -506,8 +506,12 @@ struct dlm_ls { struct miscdevice ls_device; + struct workqueue_struct *ls_callback_wq; + /* recovery related */ + struct mutex ls_cb_mutex; + struct list_head ls_cb_delay; /* save for queue_work later */ struct timer_list ls_timer; struct task_struct *ls_recoverd_task; struct mutex ls_recoverd_active; @@ -544,6 +548,7 @@ struct dlm_ls { #define LSFL_RCOM_WAIT 4 #define LSFL_UEVENT_WAIT 5 #define LSFL_TIMEWARN 6 +#define LSFL_CB_DELAY 7 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f71d0b5abd95..83b5e32514e1 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -305,7 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) rv = -EDEADLK; } - dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); + dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); } static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -319,7 +319,7 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) if (is_master_copy(lkb)) { send_bast(r, lkb, rqmode); } else { - dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0); + dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0); } } @@ -327,19 +327,68 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) * Basic operations on rsb's and lkb's */ -static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) +static int pre_rsb_struct(struct dlm_ls *ls) +{ + 
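+	/*
+	 * Why stock rsbs in advance: find_rsb() may discover it needs a
+	 * new rsb while holding a bucket spinlock, where the GFP_NOFS
+	 * allocation in dlm_allocate_rsb() may sleep and so cannot be
+	 * called.  This function therefore tops up a per-lockspace stash,
+	 * ls_new_rsb, outside any lock; get_rsb_struct() below hands one
+	 * out under the lock, returning -EAGAIN when the stash runs dry
+	 * so the caller can unlock and refill.
+	 */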
struct dlm_rsb *r1, *r2; + int count = 0; + + spin_lock(&ls->ls_new_rsb_spin); + if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) { + spin_unlock(&ls->ls_new_rsb_spin); + return 0; + } + spin_unlock(&ls->ls_new_rsb_spin); + + r1 = dlm_allocate_rsb(ls); + r2 = dlm_allocate_rsb(ls); + + spin_lock(&ls->ls_new_rsb_spin); + if (r1) { + list_add(&r1->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + if (r2) { + list_add(&r2->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + + if (!count) + return -ENOMEM; + return 0; +} + +/* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can + unlock any spinlocks, go back and call pre_rsb_struct again. + Otherwise, take an rsb off the list and return it. */ + +static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, + struct dlm_rsb **r_ret) { struct dlm_rsb *r; + int count; - r = dlm_allocate_rsb(ls, len); - if (!r) - return NULL; + spin_lock(&ls->ls_new_rsb_spin); + if (list_empty(&ls->ls_new_rsb)) { + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + log_debug(ls, "find_rsb retry %d %d %s", + count, dlm_config.ci_new_rsb_count, name); + return -EAGAIN; + } + + r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); + list_del(&r->res_hashchain); + ls->ls_new_rsb_count--; + spin_unlock(&ls->ls_new_rsb_spin); r->res_ls = ls; r->res_length = len; memcpy(r->res_name, name, len); mutex_init(&r->res_mutex); + INIT_LIST_HEAD(&r->res_hashchain); INIT_LIST_HEAD(&r->res_lookup); INIT_LIST_HEAD(&r->res_grantqueue); INIT_LIST_HEAD(&r->res_convertqueue); @@ -347,7 +396,8 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) INIT_LIST_HEAD(&r->res_root_list); INIT_LIST_HEAD(&r->res_recover_list); - return r; + *r_ret = r; + return 0; } static int search_rsb_list(struct list_head *head, char *name, int len, @@ -405,16 +455,6 @@ static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, return error; } -static int search_rsb(struct dlm_ls *ls, char *name, int len, int b, - unsigned int flags, struct dlm_rsb **r_ret) -{ - int error; - spin_lock(&ls->ls_rsbtbl[b].lock); - error = _search_rsb(ls, name, len, b, flags, r_ret); - spin_unlock(&ls->ls_rsbtbl[b].lock); - return error; -} - /* * Find rsb in rsbtbl and potentially create/add one * @@ -432,35 +472,48 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b, static int find_rsb(struct dlm_ls *ls, char *name, int namelen, unsigned int flags, struct dlm_rsb **r_ret) { - struct dlm_rsb *r = NULL, *tmp; + struct dlm_rsb *r = NULL; uint32_t hash, bucket; - int error = -EINVAL; + int error; - if (namelen > DLM_RESNAME_MAXLEN) + if (namelen > DLM_RESNAME_MAXLEN) { + error = -EINVAL; goto out; + } if (dlm_no_directory(ls)) flags |= R_CREATE; - error = 0; hash = jhash(name, namelen, 0); bucket = hash & (ls->ls_rsbtbl_size - 1); - error = search_rsb(ls, name, namelen, bucket, flags, &r); + retry: + if (flags & R_CREATE) { + error = pre_rsb_struct(ls); + if (error < 0) + goto out; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + + error = _search_rsb(ls, name, namelen, bucket, flags, &r); if (!error) - goto out; + goto out_unlock; if (error == -EBADR && !(flags & R_CREATE)) - goto out; + goto out_unlock; /* the rsb was found but wasn't a master copy */ if (error == -ENOTBLK) - goto out; + goto out_unlock; - error = -ENOMEM; - r = create_rsb(ls, name, namelen); - if (!r) - goto out; + error = get_rsb_struct(ls, name, 
namelen, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + goto retry; + } + if (error) + goto out_unlock; r->res_hash = hash; r->res_bucket = bucket; @@ -474,18 +527,10 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, nodeid = 0; r->res_nodeid = nodeid; } - - spin_lock(&ls->ls_rsbtbl[bucket].lock); - error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); - if (!error) { - spin_unlock(&ls->ls_rsbtbl[bucket].lock); - dlm_free_rsb(r); - r = tmp; - goto out; - } list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); - spin_unlock(&ls->ls_rsbtbl[bucket].lock); error = 0; + out_unlock: + spin_unlock(&ls->ls_rsbtbl[bucket].lock); out: *r_ret = r; return error; @@ -580,9 +625,8 @@ static void detach_lkb(struct dlm_lkb *lkb) static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) { - struct dlm_lkb *lkb, *tmp; - uint32_t lkid = 0; - uint16_t bucket; + struct dlm_lkb *lkb; + int rv, id; lkb = dlm_allocate_lkb(ls); if (!lkb) @@ -594,60 +638,42 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); INIT_LIST_HEAD(&lkb->lkb_time_list); - INIT_LIST_HEAD(&lkb->lkb_astqueue); + INIT_LIST_HEAD(&lkb->lkb_cb_list); + mutex_init(&lkb->lkb_cb_mutex); + INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); - get_random_bytes(&bucket, sizeof(bucket)); - bucket &= (ls->ls_lkbtbl_size - 1); - - write_lock(&ls->ls_lkbtbl[bucket].lock); + retry: + rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS); + if (!rv) + return -ENOMEM; - /* counter can roll over so we must verify lkid is not in use */ + spin_lock(&ls->ls_lkbidr_spin); + rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id); + if (!rv) + lkb->lkb_id = id; + spin_unlock(&ls->ls_lkbidr_spin); - while (lkid == 0) { - lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++; + if (rv == -EAGAIN) + goto retry; - list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list, - lkb_idtbl_list) { - if (tmp->lkb_id != lkid) - continue; - lkid = 0; - break; - } + if (rv < 0) { + log_error(ls, "create_lkb idr error %d", rv); + return rv; } - lkb->lkb_id = lkid; - list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list); - write_unlock(&ls->ls_lkbtbl[bucket].lock); - *lkb_ret = lkb; return 0; } -static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid) -{ - struct dlm_lkb *lkb; - uint16_t bucket = (lkid >> 16); - - list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) { - if (lkb->lkb_id == lkid) - return lkb; - } - return NULL; -} - static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; - uint16_t bucket = (lkid >> 16); - - if (bucket >= ls->ls_lkbtbl_size) - return -EBADSLT; - read_lock(&ls->ls_lkbtbl[bucket].lock); - lkb = __find_lkb(ls, lkid); + spin_lock(&ls->ls_lkbidr_spin); + lkb = idr_find(&ls->ls_lkbidr, lkid); if (lkb) kref_get(&lkb->lkb_ref); - read_unlock(&ls->ls_lkbtbl[bucket].lock); + spin_unlock(&ls->ls_lkbidr_spin); *lkb_ret = lkb; return lkb ? 
0 : -ENOENT; @@ -668,12 +694,12 @@ static void kill_lkb(struct kref *kref) static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) { - uint16_t bucket = (lkb->lkb_id >> 16); + uint32_t lkid = lkb->lkb_id; - write_lock(&ls->ls_lkbtbl[bucket].lock); + spin_lock(&ls->ls_lkbidr_spin); if (kref_put(&lkb->lkb_ref, kill_lkb)) { - list_del(&lkb->lkb_idtbl_list); - write_unlock(&ls->ls_lkbtbl[bucket].lock); + idr_remove(&ls->ls_lkbidr, lkid); + spin_unlock(&ls->ls_lkbidr_spin); detach_lkb(lkb); @@ -683,7 +709,7 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) dlm_free_lkb(lkb); return 1; } else { - write_unlock(&ls->ls_lkbtbl[bucket].lock); + spin_unlock(&ls->ls_lkbidr_spin); return 0; } } @@ -849,9 +875,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) if (!num_nodes) { num_nodes = ls->ls_num_nodes; - warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int)); - if (warned) - memset(warned, 0, num_nodes * sizeof(int)); + warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL); } if (!warned) continue; @@ -863,9 +887,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); } mutex_unlock(&ls->ls_waiters_mutex); - - if (warned) - kfree(warned); + kfree(warned); if (debug_expired) log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", @@ -2401,9 +2423,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) if (deadlk) { /* it's left on the granted queue */ - log_debug(r->res_ls, "deadlock %x node %d sts%d g%d r%d %s", - lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_status, - lkb->lkb_grmode, lkb->lkb_rqmode, r->res_name); revert_lock(r, lkb); queue_cast(r, lkb, -EDEADLK); error = -EDEADLK; @@ -3993,8 +4012,6 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) default: log_error(ls, "unknown message type %d", ms->m_type); } - - dlm_astd_wake(); } /* If the lockspace is in recovery mode (locking stopped), then normal @@ -4133,7 +4150,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) struct dlm_message *ms_stub; int wait_type, stub_unlock_result, stub_cancel_result; - ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message)); + ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); if (!ms_stub) { log_error(ls, "dlm_recover_waiters_pre no mem"); return; @@ -4809,7 +4826,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, goto out_put; spin_lock(&ua->proc->locks_spin); - /* dlm_user_add_ast() may have already taken lkb off the proc list */ + /* dlm_user_add_cb() may have already taken lkb off the proc list */ if (!list_empty(&lkb->lkb_ownqueue)) list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); @@ -4946,7 +4963,7 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) /* We have to release clear_proc_locks mutex before calling unlock_proc_lock() (which does lock_rsb) due to deadlock with receiving a message that does - lock_rsb followed by dlm_user_add_ast() */ + lock_rsb followed by dlm_user_add_cb() */ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, struct dlm_user_proc *proc) @@ -4969,7 +4986,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, return lkb; } -/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which +/* The ls_clear_proc_locks mutex protects against dlm_user_add_cb() which 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, which we clear here. 
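 *
 * Concretely, without the mutex dlm_user_add_cb() could run between the
 * kfree of an lkb's ua and the purge of proc->asts below, dereferencing
 * freed memory or re-adding an lkb to a list that has just been emptied.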
*/ @@ -5011,10 +5028,10 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { memset(&lkb->lkb_callbacks, 0, sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); - list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } @@ -5053,10 +5070,10 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_unlock(&proc->locks_spin); spin_lock(&proc->asts_spin); - list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { memset(&lkb->lkb_callbacks, 0, sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); - list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } spin_unlock(&proc->asts_spin); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 14cbf4099753..a1d8f1af144b 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -15,7 +15,6 @@ #include "lockspace.h" #include "member.h" #include "recoverd.h" -#include "ast.h" #include "dir.h" #include "lowcomms.h" #include "config.h" @@ -24,6 +23,7 @@ #include "recover.h" #include "requestqueue.h" #include "user.h" +#include "ast.h" static int ls_count; static struct mutex ls_lock; @@ -359,17 +359,10 @@ static int threads_start(void) { int error; - /* Thread which process lock requests for all lockspace's */ - error = dlm_astd_start(); - if (error) { - log_print("cannot start dlm_astd thread %d", error); - goto fail; - } - error = dlm_scand_start(); if (error) { log_print("cannot start dlm_scand thread %d", error); - goto astd_fail; + goto fail; } /* Thread for sending/receiving messages for all lockspace's */ @@ -383,8 +376,6 @@ static int threads_start(void) scand_fail: dlm_scand_stop(); - astd_fail: - dlm_astd_stop(); fail: return error; } @@ -393,7 +384,6 @@ static void threads_stop(void) { dlm_scand_stop(); dlm_lowcomms_stop(); - dlm_astd_stop(); } static int new_lockspace(const char *name, int namelen, void **lockspace, @@ -463,7 +453,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; - ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS); + ls->ls_rsbtbl = vmalloc(sizeof(struct dlm_rsbtable) * size); if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { @@ -472,22 +462,13 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, spin_lock_init(&ls->ls_rsbtbl[i].lock); } - size = dlm_config.ci_lkbtbl_size; - ls->ls_lkbtbl_size = size; - - ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS); - if (!ls->ls_lkbtbl) - goto out_rsbfree; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list); - rwlock_init(&ls->ls_lkbtbl[i].lock); - ls->ls_lkbtbl[i].counter = 1; - } + idr_init(&ls->ls_lkbidr); + spin_lock_init(&ls->ls_lkbidr_spin); size = dlm_config.ci_dirtbl_size; ls->ls_dirtbl_size = size; - ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS); + ls->ls_dirtbl = vmalloc(sizeof(struct dlm_dirtable) * size); if (!ls->ls_dirtbl) goto out_lkbfree; for (i = 0; i < size; i++) { @@ -502,6 +483,9 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); + INIT_LIST_HEAD(&ls->ls_new_rsb); + spin_lock_init(&ls->ls_new_rsb_spin); + 
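Two hunks above are plain bug fixes rather than refactoring: dlm_scan_waiters() and dlm_recover_waiters_pre() had the kmalloc() arguments transposed, so the numeric value of GFP_KERNEL was being passed as the allocation size and the intended size as the gfp flags. The corrected idiom, quoted from the patch, with kzalloc() also absorbing the follow-up memset():

	/* before: size and flags swapped */
	warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int));

	/* after: correct argument order, and zeroed on allocation */
	warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL);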
INIT_LIST_HEAD(&ls->ls_nodes); INIT_LIST_HEAD(&ls->ls_nodes_gone); ls->ls_num_nodes = 0; @@ -520,6 +504,9 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, init_completion(&ls->ls_members_done); ls->ls_members_result = -1; + mutex_init(&ls->ls_cb_mutex); + INIT_LIST_HEAD(&ls->ls_cb_delay); + ls->ls_recoverd_task = NULL; mutex_init(&ls->ls_recoverd_active); spin_lock_init(&ls->ls_recover_lock); @@ -553,18 +540,26 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, list_add(&ls->ls_list, &lslist); spin_unlock(&lslist_lock); + if (flags & DLM_LSFL_FS) { + error = dlm_callback_start(ls); + if (error) { + log_error(ls, "can't start dlm_callback %d", error); + goto out_delist; + } + } + /* needs to find ls in lslist */ error = dlm_recoverd_start(ls); if (error) { log_error(ls, "can't start dlm_recoverd %d", error); - goto out_delist; + goto out_callback; } ls->ls_kobj.kset = dlm_kset; error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, "%s", ls->ls_name); if (error) - goto out_stop; + goto out_recoverd; kobject_uevent(&ls->ls_kobj, KOBJ_ADD); /* let kobject handle freeing of ls if there's an error */ @@ -578,7 +573,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, error = do_uevent(ls, 1); if (error) - goto out_stop; + goto out_recoverd; wait_for_completion(&ls->ls_members_done); error = ls->ls_members_result; @@ -595,19 +590,20 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, do_uevent(ls, 0); dlm_clear_members(ls); kfree(ls->ls_node_array); - out_stop: + out_recoverd: dlm_recoverd_stop(ls); + out_callback: + dlm_callback_stop(ls); out_delist: spin_lock(&lslist_lock); list_del(&ls->ls_list); spin_unlock(&lslist_lock); kfree(ls->ls_recover_buf); out_dirfree: - kfree(ls->ls_dirtbl); + vfree(ls->ls_dirtbl); out_lkbfree: - kfree(ls->ls_lkbtbl); - out_rsbfree: - kfree(ls->ls_rsbtbl); + idr_destroy(&ls->ls_lkbidr); + vfree(ls->ls_rsbtbl); out_lsfree: if (do_unreg) kobject_put(&ls->ls_kobj); @@ -641,50 +637,64 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace, return error; } -/* Return 1 if the lockspace still has active remote locks, - * 2 if the lockspace still has active local locks. - */ -static int lockspace_busy(struct dlm_ls *ls) -{ - int i, lkb_found = 0; - struct dlm_lkb *lkb; - - /* NOTE: We check the lockidtbl here rather than the resource table. - This is because there may be LKBs queued as ASTs that have been - unlinked from their RSBs and are pending deletion once the AST has - been delivered */ - - for (i = 0; i < ls->ls_lkbtbl_size; i++) { - read_lock(&ls->ls_lkbtbl[i].lock); - if (!list_empty(&ls->ls_lkbtbl[i].list)) { - lkb_found = 1; - list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list, - lkb_idtbl_list) { - if (!lkb->lkb_nodeid) { - read_unlock(&ls->ls_lkbtbl[i].lock); - return 2; - } - } - } - read_unlock(&ls->ls_lkbtbl[i].lock); +static int lkb_idr_is_local(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + if (!lkb->lkb_nodeid) + return 1; + return 0; +} + +static int lkb_idr_is_any(int id, void *p, void *data) +{ + return 1; +} + +static int lkb_idr_free(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) + dlm_free_lvb(lkb->lkb_lvbptr); + + dlm_free_lkb(lkb); + return 0; +} + +/* NOTE: We check the lkbidr here rather than the resource table. 
+ This is because there may be LKBs queued as ASTs that have been unlinked + from their RSBs and are pending deletion once the AST has been delivered */ + +static int lockspace_busy(struct dlm_ls *ls, int force) +{ + int rv; + + spin_lock(&ls->ls_lkbidr_spin); + if (force == 0) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_any, ls); + } else if (force == 1) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_local, ls); + } else { + rv = 0; } - return lkb_found; + spin_unlock(&ls->ls_lkbidr_spin); + return rv; } static int release_lockspace(struct dlm_ls *ls, int force) { - struct dlm_lkb *lkb; struct dlm_rsb *rsb; struct list_head *head; int i, busy, rv; - busy = lockspace_busy(ls); + busy = lockspace_busy(ls, force); spin_lock(&lslist_lock); if (ls->ls_create_count == 1) { - if (busy > force) + if (busy) { rv = -EBUSY; - else { + } else { /* remove_lockspace takes ls off lslist */ ls->ls_create_count = 0; rv = 0; @@ -708,12 +718,12 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_recoverd_stop(ls); + dlm_callback_stop(ls); + remove_lockspace(ls); dlm_delete_debug_file(ls); - dlm_astd_suspend(); - kfree(ls->ls_recover_buf); /* @@ -721,31 +731,15 @@ static int release_lockspace(struct dlm_ls *ls, int force) */ dlm_dir_clear(ls); - kfree(ls->ls_dirtbl); + vfree(ls->ls_dirtbl); /* - * Free all lkb's on lkbtbl[] lists. + * Free all lkb's in idr */ - for (i = 0; i < ls->ls_lkbtbl_size; i++) { - head = &ls->ls_lkbtbl[i].list; - while (!list_empty(head)) { - lkb = list_entry(head->next, struct dlm_lkb, - lkb_idtbl_list); - - list_del(&lkb->lkb_idtbl_list); - - dlm_del_ast(lkb); - - if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) - dlm_free_lvb(lkb->lkb_lvbptr); - - dlm_free_lkb(lkb); - } - } - dlm_astd_resume(); - - kfree(ls->ls_lkbtbl); + idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls); + idr_remove_all(&ls->ls_lkbidr); + idr_destroy(&ls->ls_lkbidr); /* * Free all rsb's on rsbtbl[] lists @@ -770,7 +764,14 @@ static int release_lockspace(struct dlm_ls *ls, int force) } } - kfree(ls->ls_rsbtbl); + vfree(ls->ls_rsbtbl); + + while (!list_empty(&ls->ls_new_rsb)) { + rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } /* * Free structures on any other lists diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5e2c71f05e46..990626e7da80 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -512,12 +512,10 @@ static void process_sctp_notification(struct connection *con, } make_sockaddr(&prim.ssp_addr, 0, &addr_len); if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { - int i; unsigned char *b=(unsigned char *)&prim.ssp_addr; log_print("reject connect from unknown addr"); - for (i=0; isock_mutex); return -1; diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 8e0d00db004f..da64df7576e1 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -16,6 +16,7 @@ #include "memory.h" static struct kmem_cache *lkb_cache; +static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) @@ -26,6 +27,14 @@ int __init dlm_memory_init(void) __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) ret = -ENOMEM; + + rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), + __alignof__(struct dlm_rsb), 0, NULL); + if (!rsb_cache) { + kmem_cache_destroy(lkb_cache); + ret = -ENOMEM; + } + return ret; } @@ -33,6 +42,8 @@ void dlm_memory_exit(void) { if (lkb_cache) kmem_cache_destroy(lkb_cache); + if (rsb_cache) + kmem_cache_destroy(rsb_cache); } char *dlm_allocate_lvb(struct dlm_ls *ls) @@ -48,16 
+59,11 @@ void dlm_free_lvb(char *p) kfree(p); } -/* FIXME: have some minimal space built-in to rsb for the name and - kmalloc a separate name if needed, like dentries are done */ - -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls) { struct dlm_rsb *r; - DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); - - r = kzalloc(sizeof(*r) + namelen, GFP_NOFS); + r = kmem_cache_zalloc(rsb_cache, GFP_NOFS); return r; } @@ -65,7 +71,7 @@ void dlm_free_rsb(struct dlm_rsb *r) { if (r->res_lvbptr) dlm_free_lvb(r->res_lvbptr); - kfree(r); + kmem_cache_free(rsb_cache, r); } struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 485fb29143bd..177c11cbb0a6 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -16,7 +16,7 @@ int dlm_memory_init(void); void dlm_memory_exit(void); -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls); void dlm_free_rsb(struct dlm_rsb *r); struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); void dlm_free_lkb(struct dlm_lkb *l); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index fd677c8c3d3b..774da3cf92c6 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -58,13 +58,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) mutex_lock(&ls->ls_recoverd_active); - /* - * Suspending and resuming dlm_astd ensures that no lkb's from this ls - * will be processed by dlm_astd during recovery. - */ - - dlm_astd_suspend(); - dlm_astd_resume(); + dlm_callback_suspend(ls); /* * Free non-master tossed rsb's. Master rsb's are kept on toss @@ -202,6 +196,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_adjust_timeouts(ls); + dlm_callback_resume(ls); + error = enable_locking(ls, rv->seq); if (error) { log_debug(ls, "enable_locking failed %d", error); @@ -222,8 +218,6 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_grant_after_purge(ls); - dlm_astd_wake(); - log_debug(ls, "recover %llx done: %u ms", (unsigned long long)rv->seq, jiffies_to_msecs(jiffies - start)); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index e96bf3e9be88..d8ea60756403 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -213,9 +213,9 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, goto out; } - if (list_empty(&lkb->lkb_astqueue)) { + if (list_empty(&lkb->lkb_cb_list)) { kref_get(&lkb->lkb_ref); - list_add_tail(&lkb->lkb_astqueue, &proc->asts); + list_add_tail(&lkb->lkb_cb_list, &proc->asts); wake_up_interruptible(&proc->wait); } spin_unlock(&proc->asts_spin); @@ -832,24 +832,24 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } /* if we empty lkb_callbacks, we don't want to unlock the spinlock - without removing lkb_astqueue; so empty lkb_astqueue is always + without removing lkb_cb_list; so empty lkb_cb_list is always consistent with empty lkb_callbacks */ - lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); + lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); if (rv < 0) { /* this shouldn't happen; lkb should have been removed from list when resid was zero */ log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); - list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); goto try_another; } if (!resid) - 
list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); if (cb.flags & DLM_CB_SKIP) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 2f343b4d7a7d..3f7a59bfa7ad 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -976,16 +976,12 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; - while (next <= (loff_t)-1 && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE) - ) { + do { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - - ASSERTCMP(page_index, >=, next); - next = page_index + 1; - + next = page->index; if (PageFsCache(page)) { __fscache_wait_on_page_write(cookie, page); __fscache_uncache_page(cookie, page); @@ -993,7 +989,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, } pagevec_release(&pvec); cond_resched(); - } + } while (++next); _leave(""); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e65493a8ac00..42e477f31223 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -854,11 +854,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) { - if (metadata) - __gfs2_free_meta(ip, bstart, blen); - else - __gfs2_free_data(ip, bstart, blen); - + __gfs2_free_blocks(ip, bstart, blen, metadata); btotal += blen; } @@ -870,11 +866,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) { - if (metadata) - __gfs2_free_meta(ip, bstart, blen); - else - __gfs2_free_data(ip, bstart, blen); - + __gfs2_free_blocks(ip, bstart, blen, metadata); btotal += blen; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 091ee4779538..1cc2f8ec52a2 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -339,6 +339,67 @@ fail: return (copied) ? copied : error; } +/** + * gfs2_dir_get_hash_table - Get pointer to the dir hash table + * @ip: The inode in question + * + * Returns: The hash table or an error + */ + +static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + int ret; + u32 hsize; + __be64 *hc; + + BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH)); + + hc = ip->i_hash_cache; + if (hc) + return hc; + + hsize = 1 << ip->i_depth; + hsize *= sizeof(__be64); + if (hsize != i_size_read(&ip->i_inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + hc = kmalloc(hsize, GFP_NOFS); + ret = -ENOMEM; + if (hc == NULL) + return ERR_PTR(-ENOMEM); + + ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); + if (ret < 0) { + kfree(hc); + return ERR_PTR(ret); + } + + spin_lock(&inode->i_lock); + if (ip->i_hash_cache) + kfree(hc); + else + ip->i_hash_cache = hc; + spin_unlock(&inode->i_lock); + + return ip->i_hash_cache; +} + +/** + * gfs2_dir_hash_inval - Invalidate dir hash + * @ip: The directory inode + * + * Must be called with an exclusive glock, or during glock invalidation. 
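+ *
+ * The matching publish side is gfs2_dir_get_hash_table() above: the
+ * table is read from disk without i_lock held, and only under i_lock
+ * is it either installed in i_hash_cache or, when a concurrent reader
+ * won the race, freed in favour of the copy already cached.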
+ */ +void gfs2_dir_hash_inval(struct gfs2_inode *ip) +{ + __be64 *hc = ip->i_hash_cache; + ip->i_hash_cache = NULL; + kfree(hc); +} + static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) { return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0; @@ -686,17 +747,12 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { - __be64 leaf_no; - int error; - - error = gfs2_dir_read_data(dip, (char *)&leaf_no, - index * sizeof(__be64), - sizeof(__be64), 0); - if (error != sizeof(u64)) - return (error < 0) ? error : -EIO; - - *leaf_out = be64_to_cpu(leaf_no); + __be64 *hash; + hash = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hash)) + return PTR_ERR(hash); + *leaf_out = be64_to_cpu(*(hash + index)); return 0; } @@ -966,6 +1022,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) for (x = 0; x < half_len; x++) lp[x] = cpu_to_be64(bn); + gfs2_dir_hash_inval(dip); + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), half_len * sizeof(u64)); if (error != half_len * sizeof(u64)) { @@ -1052,70 +1110,54 @@ fail_brelse: static int dir_double_exhash(struct gfs2_inode *dip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct buffer_head *dibh; u32 hsize; - u64 *buf; - u64 *from, *to; - u64 block; - u64 disksize = i_size_read(&dip->i_inode); + u32 hsize_bytes; + __be64 *hc; + __be64 *hc2, *h; int x; int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != disksize) { - gfs2_consist_inode(dip); - return -EIO; - } + hsize_bytes = hsize * sizeof(__be64); - /* Allocate both the "from" and "to" buffers in one big chunk */ + hc = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hc)) + return PTR_ERR(hc); - buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS); - if (!buf) + h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); + if (!hc2) return -ENOMEM; - for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) { - error = gfs2_dir_read_data(dip, (char *)buf, - block * sdp->sd_hash_bsize, - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto fail; - } - - from = buf; - to = (u64 *)((char *)buf + sdp->sd_hash_bsize); - - for (x = sdp->sd_hash_ptrs; x--; from++) { - *to++ = *from; /* No endianess worries */ - *to++ = *from; - } + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_kfree; - error = gfs2_dir_write_data(dip, - (char *)buf + sdp->sd_hash_bsize, - block * sdp->sd_sb.sb_bsize, - sdp->sd_sb.sb_bsize); - if (error != sdp->sd_sb.sb_bsize) { - if (error >= 0) - error = -EIO; - goto fail; - } + for (x = 0; x < hsize; x++) { + *h++ = *hc; + *h++ = *hc; + hc++; } - kfree(buf); - - error = gfs2_meta_inode_buffer(dip, &dibh); - if (!gfs2_assert_withdraw(sdp, !error)) { - dip->i_depth++; - gfs2_dinode_out(dip, dibh->b_data); - brelse(dibh); - } + error = gfs2_dir_write_data(dip, (char *)hc2, 0, hsize_bytes * 2); + if (error != (hsize_bytes * 2)) + goto fail; - return error; + gfs2_dir_hash_inval(dip); + dip->i_hash_cache = hc2; + dip->i_depth++; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + return 0; fail: - kfree(buf); + /* Replace original hash table & size */ + gfs2_dir_write_data(dip, (char *)hc, 0, hsize_bytes); + i_size_write(&dip->i_inode, hsize_bytes); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); +out_kfree: + kfree(hc2); return error; } @@ -1348,6 +1390,7 @@ out: return error; } + /** * dir_e_read - Reads the entries from a directory into a filldir buffer * @dip: dinode 
pointer @@ -1362,9 +1405,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir) { struct gfs2_inode *dip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); u32 hsize, len = 0; - u32 ht_offset, lp_offset, ht_offset_cur = -1; u32 hash, index; __be64 *lp; int copied = 0; @@ -1372,37 +1413,17 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != i_size_read(inode)) { - gfs2_consist_inode(dip); - return -EIO; - } - hash = gfs2_dir_offset2hash(*offset); index = hash >> (32 - dip->i_depth); - lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); - if (!lp) - return -ENOMEM; + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); while (index < hsize) { - lp_offset = index & (sdp->sd_hash_ptrs - 1); - ht_offset = index - lp_offset; - - if (ht_offset_cur != ht_offset) { - error = gfs2_dir_read_data(dip, (char *)lp, - ht_offset * sizeof(__be64), - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto out; - } - ht_offset_cur = ht_offset; - } - error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, &copied, &depth, - be64_to_cpu(lp[lp_offset])); + be64_to_cpu(lp[index])); if (error) break; @@ -1410,8 +1431,6 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, index = (index & ~(len - 1)) + len; } -out: - kfree(lp); if (error > 0) error = 0; return error; @@ -1914,43 +1933,22 @@ out: int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct buffer_head *bh; struct gfs2_leaf *leaf; u32 hsize, len; - u32 ht_offset, lp_offset, ht_offset_cur = -1; u32 index = 0, next_index; __be64 *lp; u64 leaf_no; int error = 0, last; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { - gfs2_consist_inode(dip); - return -EIO; - } - lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); - if (!lp) - return -ENOMEM; + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); while (index < hsize) { - lp_offset = index & (sdp->sd_hash_ptrs - 1); - ht_offset = index - lp_offset; - - if (ht_offset_cur != ht_offset) { - error = gfs2_dir_read_data(dip, (char *)lp, - ht_offset * sizeof(__be64), - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto out; - } - ht_offset_cur = ht_offset; - } - - leaf_no = be64_to_cpu(lp[lp_offset]); + leaf_no = be64_to_cpu(lp[index]); if (leaf_no) { error = get_leaf(dip, leaf_no, &bh); if (error) @@ -1976,7 +1974,6 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) } out: - kfree(lp); return error; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index e686af11becd..ff5772fbf024 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -35,6 +35,7 @@ extern int gfs2_diradd_alloc_required(struct inode *dir, const struct qstr *filename); extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); +extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); static inline u32 gfs2_disk_hash(const char *data, int len) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a9f5cbe45cd9..bc2590ef5fc1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -174,7 +174,9 @@ void gfs2_set_inode_flags(struct inode *inode) struct gfs2_inode *ip = GFS2_I(inode); unsigned int flags = inode->i_flags; - flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); + 
if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) + inode->i_flags |= S_NOSEC; if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) flags |= S_IMMUTABLE; if (ip->i_diskflags & GFS2_DIF_APPENDONLY) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1c1336e7b3b2..88e8a23d0026 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -409,6 +409,10 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) if (held1 && held2 && list_empty(&gl->gl_holders)) clear_bit(GLF_QUEUED, &gl->gl_flags); + if (new_state != gl->gl_target) + /* shorten our minimum hold time */ + gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, + GL_GLOCK_MIN_HOLD); gl->gl_state = new_state; gl->gl_tchange = jiffies; } @@ -668,7 +672,7 @@ static void glock_work_func(struct work_struct *work) gl->gl_demote_state != LM_ST_EXCLUSIVE) { unsigned long holdtime, now = jiffies; - holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + holdtime = gl->gl_tchange + gl->gl_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; @@ -679,9 +683,14 @@ static void glock_work_func(struct work_struct *work) } run_queue(gl, 0); spin_unlock(&gl->gl_spin); - if (!delay || - queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + if (!delay) gfs2_glock_put(gl); + else { + if (gl->gl_name.ln_type != LM_TYPE_INODE) + delay = 0; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); + } if (drop_ref) gfs2_glock_put(gl); } @@ -743,6 +752,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_sbd = sdp; + gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); INIT_WORK(&gl->gl_delete, delete_work_func); @@ -855,8 +865,15 @@ static int gfs2_glock_demote_wait(void *word) static void wait_on_holder(struct gfs2_holder *gh) { + unsigned long time1 = jiffies; + might_sleep(); wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); + if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ + /* Lengthen the minimum hold time. 
*/ + gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + + GL_GLOCK_HOLD_INCR, + GL_GLOCK_MAX_HOLD); } static void wait_on_demote(struct gfs2_glock *gl) @@ -1093,8 +1110,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh) gfs2_glock_hold(gl); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags)) - delay = gl->gl_ops->go_min_hold_time; + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } @@ -1273,12 +1291,13 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) unsigned long now = jiffies; gfs2_glock_hold(gl); - holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; - if (test_bit(GLF_QUEUED, &gl->gl_flags)) { + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) { if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) - delay = gl->gl_ops->go_min_hold_time; + delay = gl->gl_hold_time; } spin_lock(&gl->gl_spin); @@ -1667,7 +1686,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; - gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n", + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n", state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, @@ -1676,7 +1695,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_revokes), - atomic_read(&gl->gl_ref)); + atomic_read(&gl->gl_ref), gl->gl_hold_time); list_for_each_entry(gh, &gl->gl_holders, gh_list) { error = dump_holder(seq, gh); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 6b2f757b9281..66707118af25 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -113,6 +113,12 @@ enum { #define GLR_TRYFAILED 13 +#define GL_GLOCK_MAX_HOLD (long)(HZ / 5) +#define GL_GLOCK_DFT_HOLD (long)(HZ / 5) +#define GL_GLOCK_MIN_HOLD (long)(10) +#define GL_GLOCK_HOLD_INCR (long)(HZ / 20) +#define GL_GLOCK_HOLD_DECR (long)(HZ / 40) + struct lm_lockops { const char *lm_proto_name; int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 2cca29316bd6..da21ecaafcc2 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -26,6 +26,7 @@ #include "rgrp.h" #include "util.h" #include "trans.h" +#include "dir.h" /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL @@ -218,6 +219,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) if (ip) { set_bit(GIF_INVALID, &ip->i_flags); forget_all_cached_acls(&ip->i_inode); + gfs2_dir_hash_inval(ip); } } @@ -316,6 +318,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_generation = be64_to_cpu(str->di_generation); ip->i_diskflags = be32_to_cpu(str->di_flags); + ip->i_eattr = be64_to_cpu(str->di_eattr); + /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ gfs2_set_inode_flags(&ip->i_inode); height = be16_to_cpu(str->di_height); if (unlikely(height > GFS2_MAX_META_HEIGHT)) @@ -328,7 +332,6 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_depth = (u8)depth; ip->i_entries = be32_to_cpu(str->di_entries); - ip->i_eattr = 
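
Taken together, the glock.c and glock.h changes above replace the fixed per-type go_min_hold_time with a per-lock gl_hold_time that adapts to contention: a state change away from the requested target shortens the hold time by GL_GLOCK_HOLD_DECR (never below GL_GLOCK_MIN_HOLD), a holder that had to wait more than a second lengthens it by GL_GLOCK_HOLD_INCR (never above GL_GLOCK_MAX_HOLD), and only inode glocks honour the hold time when delayed work is queued. A self-contained demo of the resulting feedback loop, with the glock.h constants evaluated at an assumed HZ of 1000 (so one jiffy is one millisecond):

#include <stdio.h>

/* the glock.h constants above, at HZ == 1000 (an assumption for the demo) */
#define GL_GLOCK_MAX_HOLD	200	/* HZ / 5  */
#define GL_GLOCK_DFT_HOLD	200	/* HZ / 5  */
#define GL_GLOCK_MIN_HOLD	10
#define GL_GLOCK_HOLD_INCR	50	/* HZ / 20 */
#define GL_GLOCK_HOLD_DECR	25	/* HZ / 40 */

int main(void)
{
	long hold = GL_GLOCK_DFT_HOLD;
	/* 'W' = waited more than 1s in wait_on_holder(),
	   'D' = demoted away from the requested target in state_change() */
	const char *events = "DDDDDDDDDW";
	const char *e;

	for (e = events; *e; e++) {
		if (*e == 'W') {
			hold += GL_GLOCK_HOLD_INCR;
			if (hold > GL_GLOCK_MAX_HOLD)
				hold = GL_GLOCK_MAX_HOLD;
		} else {
			hold -= GL_GLOCK_HOLD_DECR;
			if (hold < GL_GLOCK_MIN_HOLD)
				hold = GL_GLOCK_MIN_HOLD;
		}
		printf("%c -> hold = %3ld jiffies\n", *e, hold);
	}
	return 0;
}

A lock that keeps getting bounced between nodes therefore decays towards a 10-jiffy hold, while one that is expensive to reacquire creeps back towards the 200ms cap; this per-lock state is also why __dump_glock() above grows the new m:%ld field.
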
be64_to_cpu(str->di_eattr); if (S_ISREG(ip->i_inode.i_mode)) gfs2_set_aops(&ip->i_inode); @@ -549,7 +552,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_lock = inode_go_lock, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_min_hold_time = HZ / 5, .go_flags = GLOF_ASPACE, }; @@ -560,7 +562,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_min_hold_time = HZ / 5, .go_flags = GLOF_ASPACE, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 81206e70cbf6..892ac37de8ae 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -163,7 +163,6 @@ struct gfs2_glock_operations { int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); void (*go_callback) (struct gfs2_glock *gl); const int go_type; - const unsigned long go_min_hold_time; const unsigned long go_flags; #define GLOF_ASPACE 1 }; @@ -221,6 +220,7 @@ struct gfs2_glock { unsigned int gl_hash; unsigned long gl_demote_time; /* time of first demote request */ + long gl_hold_time; struct list_head gl_holders; const struct gfs2_glock_operations *gl_ops; @@ -285,6 +285,7 @@ struct gfs2_inode { u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; + __be64 *i_hash_cache; u32 i_entries; u32 i_diskflags; u8 i_height; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c2b34cd2abe0..29e1ace7953d 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -41,6 +41,7 @@ static void gfs2_init_inode_once(void *foo) init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); ip->i_alloc = NULL; + ip->i_hash_cache = NULL; } static void gfs2_init_glock_once(void *foo) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 2a77071fb7b6..516516e0c2a2 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1094,6 +1094,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + sb->s_flags |= MS_NOSEC; sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9b780df3fd54..7f8af1eb02de 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1607,14 +1607,15 @@ rgrp_error: } /** - * gfs2_free_data - free a contiguous run of data block(s) + * __gfs2_free_blocks - free a contiguous run of block(s) * @ip: the inode these blocks are being freed from * @bstart: first block of a run of contiguous blocks * @blen: the length of the block run + * @meta: 1 if the blocks represent metadata * */ -void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1631,53 +1632,10 @@ void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_trans_add_rg(rgd); /* Directories keep their data in the metadata address space */ - if (ip->i_depth) + if (meta || ip->i_depth) gfs2_meta_wipe(ip, bstart, blen); } -/** - * gfs2_free_data - free a contiguous run of data block(s) - * @ip: the inode these blocks are being freed from - * @bstart: first block of a run of contiguous blocks - * @blen: the length of the block run - * - */ - -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - - __gfs2_free_data(ip, bstart, blen); - gfs2_statfs_change(sdp, 0, +blen, 0); - gfs2_quota_change(ip, 
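
The MS_NOSEC flag set on the superblock here, combined with the per-inode S_NOSEC logic added to gfs2_set_inode_flags() earlier in this patch, lets the VFS skip the suid/security-xattr stripping that otherwise runs on every write. Roughly what the generic fast path looks like, simplified from the VFS of this era, so treat it as a sketch rather than the exact code:

/* sketch of the generic write-path check that S_NOSEC enables */
static int file_remove_suid_sketch(struct file *file)
{
	struct inode *inode = file->f_path.dentry->d_inode;

	if (IS_NOSEC(inode))	/* i_flags & S_NOSEC */
		return 0;	/* fast path: no dentry walk, no security hook */

	/*
	 * Otherwise fall through to the usual slow path that clears
	 * setuid/setgid bits and file capabilities (hypothetical name).
	 */
	return remove_suid_slowpath(file);
}

GFS2 may only claim S_NOSEC when the inode has no extended attributes (i_eattr == 0), since file capabilities live in xattrs, and no setuid/setgid bits; that dependency is exactly why the glops.c hunk above moves the i_eattr read in gfs2_dinode_in() to before the gfs2_set_inode_flags() call.
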
-(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); -} - -/** - * gfs2_free_meta - free a contiguous run of data block(s) - * @ip: the inode these blocks are being freed from - * @bstart: first block of a run of contiguous blocks - * @blen: the length of the block run - * - */ - -void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrpd *rgd; - - rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); - if (!rgd) - return; - trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); - rgd->rd_free += blen; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - - gfs2_trans_add_rg(rgd); - gfs2_meta_wipe(ip, bstart, blen); -} - /** * gfs2_free_meta - free a contiguous run of data block(s) * @ip: the inode these blocks are being freed from @@ -1690,7 +1648,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - __gfs2_free_meta(ip, bstart, blen); + __gfs2_free_blocks(ip, bstart, blen, 1); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index a80e3034ac47..d253f9a8c70e 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -52,9 +52,7 @@ extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); -extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index fb0edf735483..b7beadd9ba4c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1533,7 +1533,7 @@ out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); - + gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 2312de34bd42..2a734cfccc92 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -43,6 +43,10 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) node->tree->node_size - (rec + 1) * 2); if (!recoff) return 0; + if (recoff > node->tree->node_size - 2) { + printk(KERN_ERR "hfs: recoff %d too large\n", recoff); + return 0; + } retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index b4ba1b319333..4dfbfec357e8 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -212,7 +212,9 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfsplus_fill_cat_thread(sb, &entry, @@ -269,7 +271,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) 
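
In the rgrp.c hunks above, gfs2_free_data() and __gfs2_free_meta(), two nearly identical bodies, collapse into the single __gfs2_free_blocks(ip, bstart, blen, meta); gfs2_free_meta() survives as a thin wrapper in the hunk above. A data-side caller would presumably keep doing what the removed gfs2_free_data() did, with the accounting left at the call site; a sketch with a hypothetical name:

static void gfs2_free_data_sketch(struct gfs2_inode *ip, u64 bstart, u32 blen)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);

	__gfs2_free_blocks(ip, bstart, blen, 0);	/* meta == 0 */
	gfs2_statfs_change(sdp, 0, +blen, 0);
	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid,
			  ip->i_inode.i_gid);
}

The meta flag only decides whether gfs2_meta_wipe() is forced; directories still get their buffers wiped either way, because they keep their data in the metadata address space.
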
dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; if (!str) { int len; @@ -347,12 +351,14 @@ int hfsplus_rename_cat(u32 cnid, struct hfs_find_data src_fd, dst_fd; hfsplus_cat_entry entry; int entry_size, type; - int err = 0; + int err; dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); + if (err) + return err; dst_fd = src_fd; /* find the old dir entry and read the data */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4df5059c25da..25b2443a004c 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -38,7 +38,9 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; dentry->d_fsdata = NULL; - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return ERR_PTR(err); hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); @@ -132,7 +134,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos >= inode->i_size) return 0; - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index b1991a2a08e0..5849e3ef35cc 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -119,22 +119,31 @@ static void __hfsplus_ext_write_extent(struct inode *inode, set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); } -static void hfsplus_ext_write_extent_locked(struct inode *inode) +static int hfsplus_ext_write_extent_locked(struct inode *inode) { + int res; + if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { struct hfs_find_data fd; - hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + if (res) + return res; __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } + return 0; } -void hfsplus_ext_write_extent(struct inode *inode) +int hfsplus_ext_write_extent(struct inode *inode) { + int res; + mutex_lock(&HFSPLUS_I(inode)->extents_lock); - hfsplus_ext_write_extent_locked(inode); + res = hfsplus_ext_write_extent_locked(inode); mutex_unlock(&HFSPLUS_I(inode)->extents_lock); + + return res; } static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, @@ -194,9 +203,11 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block) block < hip->cached_start + hip->cached_blocks) return 0; - hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); - res = __hfsplus_ext_cache_extent(&fd, inode, block); - hfs_find_exit(&fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + if (!res) { + res = __hfsplus_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + } return res; } @@ -209,6 +220,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res = -EIO; u32 ablock, dblock, mask; + sector_t sector; int was_dirty = 0; int shift; @@ -255,10 +267,12 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, done: dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", 
inode->i_ino, (long long)iblock, dblock); + mask = (1 << sbi->fs_shift) - 1; - map_bh(bh_result, sb, - (dblock << sbi->fs_shift) + sbi->blockoffset + - (iblock & mask)); + sector = ((sector_t)dblock << sbi->fs_shift) + + sbi->blockoffset + (iblock & mask); + map_bh(bh_result, sb, sector); + if (create) { set_buffer_new(bh_result); hip->phys_size += sb->s_blocksize; @@ -371,7 +385,9 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, if (total_blocks == blocks) return 0; - hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + if (res) + return res; do { res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, total_blocks, type); @@ -469,7 +485,9 @@ out: insert_extent: dprint(DBG_EXTENT, "insert new extent\n"); - hfsplus_ext_write_extent_locked(inode); + res = hfsplus_ext_write_extent_locked(inode); + if (res) + goto out; memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->cached_extents[0].start_block = cpu_to_be32(start); @@ -500,7 +518,6 @@ void hfsplus_file_truncate(struct inode *inode) struct page *page; void *fsdata; u32 size = inode->i_size; - int res; res = pagecache_write_begin(NULL, mapping, size, 0, AOP_FLAG_UNINTERRUPTIBLE, @@ -523,7 +540,12 @@ void hfsplus_file_truncate(struct inode *inode) goto out; mutex_lock(&hip->extents_lock); - hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + if (res) { + mutex_unlock(&hip->extents_lock); + /* XXX: We lack error handling of hfsplus_file_truncate() */ + return; + } while (1) { if (alloc_cnt == hip->first_blocks) { hfsplus_free_extents(sb, hip->first_extents, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d6857523336d..81dfd1e495e3 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "hfsplus_raw.h" #define DBG_BNODE_REFS 0x00000001 @@ -110,7 +111,9 @@ struct hfsplus_vh; struct hfs_btree; struct hfsplus_sb_info { + void *s_vhdr_buf; struct hfsplus_vh *s_vhdr; + void *s_backup_vhdr_buf; struct hfsplus_vh *s_backup_vhdr; struct hfs_btree *ext_tree; struct hfs_btree *cat_tree; @@ -258,6 +261,15 @@ struct hfsplus_readdir_data { struct hfsplus_cat_key key; }; +/* + * Find minimum acceptible I/O size for an hfsplus sb. 
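
The map_bh() change above is a 32-bit overflow fix: dblock is a u32, so "dblock << sbi->fs_shift" used to be computed in 32-bit arithmetic and silently wrapped once the sector offset passed 2^32. Casting to sector_t before the shift widens the arithmetic. A standalone illustration with 4 KiB allocation blocks (fs_shift == 3, i.e. eight 512-byte sectors per block):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t dblock = 0x20000000;		/* block at the 2 TiB mark */
	uint32_t bad = dblock << 3;		/* wraps to 0 in 32 bits */
	uint64_t good = (uint64_t)dblock << 3;	/* widen first, as the fix does */

	printf("bad  = 0x%x\n", bad);				/* 0x0         */
	printf("good = 0x%llx\n", (unsigned long long)good);	/* 0x100000000 */
	return 0;
}

This, together with the sector_t casts in the wrapper.c hunks below, is what allows the explicit "volumes larger than 2TB are not supported yet" bail-out to be deleted further down.
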
+ */ +static inline unsigned short hfsplus_min_io_size(struct super_block *sb) +{ + return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + HFSPLUS_SECTOR_SIZE); +} + #define hfs_btree_open hfsplus_btree_open #define hfs_btree_close hfsplus_btree_close #define hfs_btree_write hfsplus_btree_write @@ -374,7 +386,7 @@ extern const struct file_operations hfsplus_dir_operations; /* extents.c */ int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); -void hfsplus_ext_write_extent(struct inode *); +int hfsplus_ext_write_extent(struct inode *); int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int); @@ -436,8 +448,8 @@ int hfsplus_compare_dentry(const struct dentry *parent, /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); int hfs_part_find(struct super_block *, sector_t *, sector_t *); -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw); /* time macros */ #define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b248a6cfcad9..010cd363d085 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -195,11 +195,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, hip->flags = 0; set_bit(HFSPLUS_I_RSRC, &hip->flags); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); - err = hfsplus_find_cat(sb, dir->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (!err) { + err = hfsplus_find_cat(sb, dir->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } if (err) { iput(inode); return ERR_PTR(err); diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 40ad88c12c64..eb355d81e279 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -88,11 +88,12 @@ static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, return -ENOENT; } -static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, - sector_t *part_start, sector_t *part_size) +static int hfs_parse_new_pmap(struct super_block *sb, void *buf, + struct new_pmap *pm, sector_t *part_start, sector_t *part_size) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); int size = be32_to_cpu(pm->pmMapBlkCnt); + int buf_size = hfsplus_min_io_size(sb); int res; int i = 0; @@ -107,11 +108,14 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, if (++i >= size) return -ENOENT; - res = hfsplus_submit_bio(sb->s_bdev, - *part_start + HFS_PMAP_BLK + i, - pm, READ); - if (res) - return res; + pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); + if ((u8 *)pm - (u8 *)buf >= buf_size) { + res = hfsplus_submit_bio(sb, + *part_start + HFS_PMAP_BLK + i, + buf, (void **)&pm, READ); + if (res) + return res; + } } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); return -ENOENT; @@ -124,15 +128,15 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { - void *data; + void *buf, *data; int res; - data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!data) + buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!buf) return -ENOMEM; - res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, - 
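
hfsplus historically performed all metadata I/O in fixed 512-byte units, which cannot work on media whose logical block size is larger (4Kn disks, for instance). hfsplus_min_io_size() above picks the larger of the device's logical block size and HFSPLUS_SECTOR_SIZE, and callers such as hfs_part_find() now allocate a buffer of that size and receive, through the new double pointer, the location of their 512-byte record inside it. A usage sketch mirroring hfs_part_find(), with error handling trimmed:

	void *buf, *data;
	int res;

	buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); /* 4096 on a 4Kn disk */
	res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK,
				 buf, &data, READ);
	/*
	 * data now points at the requested 512-byte sector inside buf;
	 * on a 512-byte-sector device, data == buf.
	 */

hfs_parse_new_pmap() then walks successive 512-byte partition-map records inside that buffer and only submits another bio once the cursor steps past buf_size.
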
data, READ); + res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, + buf, &data, READ); if (res) goto out; @@ -141,13 +145,13 @@ int hfs_part_find(struct super_block *sb, res = hfs_parse_old_pmap(sb, data, part_start, part_size); break; case HFS_NEW_PMAP_MAGIC: - res = hfs_parse_new_pmap(sb, data, part_start, part_size); + res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); break; default: res = -ENOENT; break; } out: - kfree(data); + kfree(buf); return res; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 84a47b709f51..c106ca22e812 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -73,11 +73,13 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) { - hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); - err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (!err) { + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } } else { err = hfsplus_system_read_inode(inode); } @@ -133,9 +135,13 @@ static int hfsplus_system_write_inode(struct inode *inode) static int hfsplus_write_inode(struct inode *inode, struct writeback_control *wbc) { + int err; + dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); - hfsplus_ext_write_extent(inode); + err = hfsplus_ext_write_extent(inode); + if (err) + return err; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) @@ -197,17 +203,17 @@ int hfsplus_sync_fs(struct super_block *sb, int wait) write_backup = 1; } - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, WRITE_SYNC); + sbi->s_vhdr_buf, NULL, WRITE_SYNC); if (!error) error = error2; if (!write_backup) goto out; - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr, WRITE_SYNC); + sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC); if (!error) error2 = error; out: @@ -251,8 +257,8 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); iput(sbi->alloc_file); iput(sbi->hidden_dir); - kfree(sbi->s_vhdr); - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); unload_nls(sbi->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; @@ -393,6 +399,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->rsrc_clump_blocks) sbi->rsrc_clump_blocks = 1; + err = generic_check_addressable(sbi->alloc_blksz_shift, + sbi->total_blocks); + if (err) { + printk(KERN_ERR "hfs: filesystem size too large.\n"); + goto out_free_vhdr; + } + /* Set up operations so we can load metadata */ sb->s_op = &hfsplus_sops; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -417,6 +430,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY; } + err = -EINVAL; + /* Load metadata objects (B*Trees) */ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { @@ -447,7 +462,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; - hfs_find_init(sbi->cat_tree, &fd); + err = hfs_find_init(sbi->cat_tree, &fd); + if (err) + goto 
out_put_root; hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index a3f0bfcc881e..a32998f29f0b 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -142,7 +142,11 @@ int hfsplus_uni2asc(struct super_block *sb, /* search for single decomposed char */ if (likely(compose)) ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0); - if (ce1 && (cc = ce1[0])) { + if (ce1) + cc = ce1[0]; + else + cc = 0; + if (cc) { /* start of a possibly decomposed Hangul char */ if (cc != 0xffff) goto done; @@ -209,7 +213,8 @@ int hfsplus_uni2asc(struct super_block *sb, i++; ce2 = ce1; } - if ((cc = ce2[0])) { + cc = ce2[0]; + if (cc) { ip += i; ustrlen -= i; goto done; @@ -301,7 +306,11 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { size = asc2unichar(sb, astr, len, &c); - if (decompose && (dstr = decompose_unichar(c, &dsize))) { + if (decompose) + dstr = decompose_unichar(c, &dsize); + else + dstr = NULL; + if (dstr) { if (outlen + dsize > HFSPLUS_MAX_STRLEN) break; do { @@ -346,15 +355,23 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, astr += size; len -= size; - if (decompose && (dstr = decompose_unichar(c, &dsize))) { + if (decompose) + dstr = decompose_unichar(c, &dsize); + else + dstr = NULL; + if (dstr) { do { c2 = *dstr++; - if (!casefold || (c2 = case_fold(c2))) + if (casefold) + c2 = case_fold(c2); + if (!casefold || c2) hash = partial_name_hash(c2, hash); } while (--dsize > 0); } else { c2 = c; - if (!casefold || (c2 = case_fold(c2))) + if (casefold) + c2 = case_fold(c2); + if (!casefold || c2) hash = partial_name_hash(c2, hash); } } @@ -422,12 +439,14 @@ int hfsplus_compare_dentry(const struct dentry *parent, c1 = *dstr1; c2 = *dstr2; if (casefold) { - if (!(c1 = case_fold(c1))) { + c1 = case_fold(c1); + if (!c1) { dstr1++; dsize1--; continue; } - if (!(c2 = case_fold(c2))) { + c2 = case_fold(c2); + if (!c2) { dstr2++; dsize2--; continue; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 4ac88ff79aa6..10e515a0d452 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -31,25 +31,67 @@ static void hfsplus_end_io_sync(struct bio *bio, int err) complete(bio->bi_private); } -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw) +/* + * hfsplus_submit_bio - Perfrom block I/O + * @sb: super block of volume for I/O + * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes + * @buf: buffer for I/O + * @data: output pointer for location of requested data + * @rw: direction of I/O + * + * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than + * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads + * @data will return a pointer to the start of the requested sector, + * which may not be the same location as @buf. + * + * If @sector is not aligned to the bdev logical block size it will + * be rounded down. For writes this means that @buf should contain data + * that starts at the rounded-down address. As long as the data was + * read using hfsplus_submit_bio() and the same buffer is used things + * will work correctly. 
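
The unicode.c hunks above are intended to change no behaviour; they only hoist assignments out of conditionals, which reads more clearly and keeps checkpatch quiet. The pattern in miniature (flow-equivalent at these call sites because every later use of cc is guarded):

	/* before */
	if (ce1 && (cc = ce1[0]))
		do_something(cc);

	/* after */
	if (ce1)
		cc = ce1[0];
	else
		cc = 0;
	if (cc)
		do_something(cc);
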
+ */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw) { DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; int ret = 0; + unsigned int io_size; + loff_t start; + int offset; + + /* + * Align sector to hardware sector size and find offset. We + * assume that io_size is a power of two, which _should_ + * be true. + */ + io_size = hfsplus_min_io_size(sb); + start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + offset = start & (io_size - 1); + sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; - bio->bi_bdev = bdev; + bio->bi_bdev = sb->s_bdev; bio->bi_end_io = hfsplus_end_io_sync; bio->bi_private = &wait; - /* - * We always submit one sector at a time, so bio_add_page must not fail. - */ - if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE, - offset_in_page(data)) != HFSPLUS_SECTOR_SIZE) - BUG(); + if (!(rw & WRITE) && data) + *data = (u8 *)buf + offset; + + while (io_size > 0) { + unsigned int page_offset = offset_in_page(buf); + unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, + io_size); + + ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); + if (ret != len) { + ret = -EIO; + goto out; + } + io_size -= len; + buf = (u8 *)buf + len; + } submit_bio(rw, bio); wait_for_completion(&wait); @@ -57,8 +99,9 @@ int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; +out: bio_put(bio); - return ret; + return ret < 0 ? ret : 0; } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) @@ -141,23 +184,19 @@ int hfsplus_read_wrapper(struct super_block *sb) if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; - if ((u64)part_start + part_size > 0x100000000ULL) { - pr_err("hfs: volumes larger than 2TB are not supported yet\n"); - goto out; - } error = -ENOMEM; - sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_vhdr) + sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_vhdr_buf) goto out; - sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_backup_vhdr) + sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_backup_vhdr_buf) goto out_free_vhdr; reread: - error = hfsplus_submit_bio(sb->s_bdev, - part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, + READ); if (error) goto out_free_backup_vhdr; @@ -172,8 +211,9 @@ reread: if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) goto out_free_backup_vhdr; wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; - part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; - part_size = wd.embed_count * wd.ablk_size; + part_start += (sector_t)wd.ablk_start + + (sector_t)wd.embed_start * wd.ablk_size; + part_size = (sector_t)wd.embed_count * wd.ablk_size; goto reread; default: /* @@ -186,9 +226,9 @@ reread: goto reread; } - error = hfsplus_submit_bio(sb->s_bdev, - part_start + part_size - 2, - sbi->s_backup_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + part_size - 2, + sbi->s_backup_vhdr_buf, + (void **)&sbi->s_backup_vhdr, READ); if (error) goto out_free_backup_vhdr; diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 87cd0ead8633..fb3b5c813a30 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -78,7 +78,7 @@ static int nothing_to_commit(struct ubifs_info *c) * If the root TNC node is dirty, we definitely have something to * 
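
The alignment arithmetic at the top of the new hfsplus_submit_bio() rounds the caller's 512-byte sector down to an io_size boundary and keeps the byte offset, so the caller still gets a pointer to the sector it asked for. Worked example for io_size == 4096 and a request for sector 7 (HFSPLUS_SECTOR_SHIFT is 9 for 512-byte sectors):

	io_size = 4096;				/* hfsplus_min_io_size(sb)  */
	start   = 7 << HFSPLUS_SECTOR_SHIFT;	/* 7 * 512       = 3584     */
	offset  = start & (io_size - 1);	/* 3584 & 4095   = 3584     */
	sector &= ~((io_size >> 9) - 1);	/* 7 & ~7        = 0        */
	/* the bio transfers sectors 0..7 into buf; *data = buf + 3584  */

Note also the return-value normalization at the end: ret may hold a positive byte count from bio_add_page(), so the function now returns "ret < 0 ? ret : 0" rather than leaking that count to callers that test for non-zero.
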
commit. */ - if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + if (c->zroot.znode && ubifs_zn_dirty(c->zroot.znode)) return 0; /* @@ -418,7 +418,7 @@ int ubifs_run_commit(struct ubifs_info *c) spin_lock(&c->cs_lock); if (c->cmt_state == COMMIT_BROKEN) { - err = -EINVAL; + err = -EROFS; goto out; } @@ -444,7 +444,7 @@ int ubifs_run_commit(struct ubifs_info *c) * re-check it. */ if (c->cmt_state == COMMIT_BROKEN) { - err = -EINVAL; + err = -EROFS; goto out_cmt_unlock; } @@ -576,7 +576,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) struct idx_node *i; size_t sz; - if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) + if (!dbg_is_chk_index(c)) return 0; INIT_LIST_HEAD(&list); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0bb2bcef0de9..eef109a1a927 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -27,13 +27,12 @@ * various local functions of those subsystems. */ -#define UBIFS_DBG_PRESERVE_UBI - -#include "ubifs.h" #include -#include #include #include +#include +#include +#include "ubifs.h" #ifdef CONFIG_UBIFS_FS_DEBUG @@ -42,15 +41,6 @@ DEFINE_SPINLOCK(dbg_lock); static char dbg_key_buf0[128]; static char dbg_key_buf1[128]; -unsigned int ubifs_chk_flags; -unsigned int ubifs_tst_flags; - -module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); -module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); - -MODULE_PARM_DESC(debug_chks, "Debug check flags"); -MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); - static const char *get_key_fmt(int fmt) { switch (fmt) { @@ -91,6 +81,28 @@ static const char *get_key_type(int type) } } +static const char *get_dent_type(int type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return "file"; + case UBIFS_ITYPE_DIR: + return "dir"; + case UBIFS_ITYPE_LNK: + return "symlink"; + case UBIFS_ITYPE_BLK: + return "blkdev"; + case UBIFS_ITYPE_CHR: + return "char dev"; + case UBIFS_ITYPE_FIFO: + return "fifo"; + case UBIFS_ITYPE_SOCK: + return "socket"; + default: + return "unknown/invalid type"; + } +} + static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, char *buffer) { @@ -234,9 +246,13 @@ static void dump_ch(const struct ubifs_ch *ch) printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); } -void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); + struct qstr nm = { .name = NULL }; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + int count = 2; printk(KERN_DEBUG "Dump in-memory inode:"); printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); @@ -270,6 +286,32 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); + + if (!S_ISDIR(inode->i_mode)) + return; + + printk(KERN_DEBUG "List of directory entries:\n"); + ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); + + lowest_dent_key(c, &key, inode->i_ino); + while (1) { + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + if (PTR_ERR(dent) != -ENOENT) + printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent)); + break; + } + + printk(KERN_DEBUG "\t%d: %s (%s)\n", + count++, dent->name, get_dent_type(dent->type)); + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + kfree(pdent); + pdent = dent; + key_read(c, 
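
From here on, every open-coded "(ubifs_chk_flags & UBIFS_CHK_xxx)" test becomes a dbg_is_chk_*() or dbg_is_tst_rcvry() predicate. Their bodies are not visible in this diff; given the per-mount knobs added below and the global ubifs_dbg declared near the end of debug.c, they presumably just OR the global and per-filesystem bits, along these lines (assumed, not taken from this patch):

static inline int dbg_is_chk_gen(const struct ubifs_info *c)
{
	return !!(ubifs_dbg.chk_gen || c->dbg->chk_gen);
}

static inline int dbg_is_tst_rcvry(const struct ubifs_info *c)
{
	return !!(ubifs_dbg.tst_rcvry || c->dbg->tst_rcvry);
}

so a check can be switched on globally for all UBIFS mounts or for a single file-system. The -EINVAL to -EROFS change in ubifs_run_commit() above is independent of this: a broken commit means the file-system has gone read-only, and the errno now says so.
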
&dent->key, &key); + } + kfree(pdent); } void dbg_dump_node(const struct ubifs_info *c, const void *node) @@ -278,7 +320,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) union ubifs_key key; const struct ubifs_ch *ch = node; - if (dbg_failure_mode) + if (dbg_is_tst_rcvry(c)) return; /* If the magic is incorrect, just hexdump the first bytes */ @@ -834,7 +876,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) struct ubifs_scan_node *snod; void *buf; - if (dbg_failure_mode) + if (dbg_is_tst_rcvry(c)) return; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", @@ -1080,6 +1122,7 @@ out: /** * dbg_check_synced_i_size - check synchronized inode size. + * @c: UBIFS file-system description object * @inode: inode to check * * If inode is clean, synchronized inode size has to be equivalent to current @@ -1087,12 +1130,12 @@ out: * has to be locked). Returns %0 if synchronized inode size if correct, and * %-EINVAL if not. */ -int dbg_check_synced_i_size(struct inode *inode) +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) { int err = 0; struct ubifs_inode *ui = ubifs_inode(inode); - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (!S_ISREG(inode->i_mode)) return 0; @@ -1125,7 +1168,7 @@ int dbg_check_synced_i_size(struct inode *inode) * Note, it is good idea to make sure the @dir->i_mutex is locked before * calling this function. */ -int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) { unsigned int nlink = 2; union ubifs_key key; @@ -1133,7 +1176,7 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) struct qstr nm = { .name = NULL }; loff_t size = UBIFS_INO_NODE_SZ; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (!S_ISDIR(dir->i_mode)) @@ -1167,12 +1210,14 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) "but calculated size is %llu", dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); + dbg_dump_inode(c, dir); dump_stack(); return -EINVAL; } if (dir->i_nlink != nlink) { ubifs_err("directory inode %lu has nlink %u, but calculated " "nlink is %u", dir->i_ino, dir->i_nlink, nlink); + dbg_dump_inode(c, dir); dump_stack(); return -EINVAL; } @@ -1489,7 +1534,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) long clean_cnt = 0, dirty_cnt = 0; int err, last; - if (!(ubifs_chk_flags & UBIFS_CHK_TNC)) + if (!dbg_is_chk_index(c)) return 0; ubifs_assert(mutex_is_locked(&c->tnc_mutex)); @@ -1736,7 +1781,7 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) int err; long long calc = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) + if (!dbg_is_chk_index(c)) return 0; err = dbg_walk_index(c, NULL, add_size, &calc); @@ -2312,7 +2357,7 @@ int dbg_check_filesystem(struct ubifs_info *c) int err; struct fsck_data fsckd; - if (!(ubifs_chk_flags & UBIFS_CHK_FS)) + if (!dbg_is_chk_fs(c)) return 0; fsckd.inodes = RB_ROOT; @@ -2347,7 +2392,7 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) struct list_head *cur; struct ubifs_scan_node *sa, *sb; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; for (cur = head->next; cur->next != head; cur = cur->next) { @@ -2414,7 +2459,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) struct list_head *cur; struct ubifs_scan_node *sa, *sb; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if 
(!dbg_is_chk_gen(c)) return 0; for (cur = head->next; cur->next != head; cur = cur->next) { @@ -2491,214 +2536,141 @@ error_dump: return 0; } -int dbg_force_in_the_gaps(void) +static inline int chance(unsigned int n, unsigned int out_of) { - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) - return 0; + return !!((random32() % out_of) + 1 <= n); - return !(random32() & 7); } -/* Failure mode for recovery testing */ - -#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d)) - -struct failure_mode_info { - struct list_head list; - struct ubifs_info *c; -}; - -static LIST_HEAD(fmi_list); -static DEFINE_SPINLOCK(fmi_lock); - -static unsigned int next; - -static int simple_rand(void) -{ - if (next == 0) - next = current->pid; - next = next * 1103515245 + 12345; - return (next >> 16) & 32767; -} - -static void failure_mode_init(struct ubifs_info *c) -{ - struct failure_mode_info *fmi; - - fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); - if (!fmi) { - ubifs_err("Failed to register failure mode - no memory"); - return; - } - fmi->c = c; - spin_lock(&fmi_lock); - list_add_tail(&fmi->list, &fmi_list); - spin_unlock(&fmi_lock); -} - -static void failure_mode_exit(struct ubifs_info *c) +static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) { - struct failure_mode_info *fmi, *tmp; - - spin_lock(&fmi_lock); - list_for_each_entry_safe(fmi, tmp, &fmi_list, list) - if (fmi->c == c) { - list_del(&fmi->list); - kfree(fmi); - } - spin_unlock(&fmi_lock); -} - -static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc) -{ - struct failure_mode_info *fmi; - - spin_lock(&fmi_lock); - list_for_each_entry(fmi, &fmi_list, list) - if (fmi->c->ubi == desc) { - struct ubifs_info *c = fmi->c; - - spin_unlock(&fmi_lock); - return c; - } - spin_unlock(&fmi_lock); - return NULL; -} - -static int in_failure_mode(struct ubi_volume_desc *desc) -{ - struct ubifs_info *c = dbg_find_info(desc); - - if (c && dbg_failure_mode) - return c->dbg->failure_mode; - return 0; -} + struct ubifs_debug_info *d = c->dbg; -static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) -{ - struct ubifs_info *c = dbg_find_info(desc); - struct ubifs_debug_info *d; + ubifs_assert(dbg_is_tst_rcvry(c)); - if (!c || !dbg_failure_mode) - return 0; - d = c->dbg; - if (d->failure_mode) - return 1; - if (!d->fail_cnt) { - /* First call - decide delay to failure */ + if (!d->pc_cnt) { + /* First call - decide delay to the power cut */ if (chance(1, 2)) { - unsigned int delay = 1 << (simple_rand() >> 11); + unsigned long delay; if (chance(1, 2)) { - d->fail_delay = 1; - d->fail_timeout = jiffies + - msecs_to_jiffies(delay); - dbg_rcvry("failing after %ums", delay); + d->pc_delay = 1; + /* Fail withing 1 minute */ + delay = random32() % 60000; + d->pc_timeout = jiffies; + d->pc_timeout += msecs_to_jiffies(delay); + ubifs_warn("failing after %lums", delay); } else { - d->fail_delay = 2; - d->fail_cnt_max = delay; - dbg_rcvry("failing after %u calls", delay); + d->pc_delay = 2; + delay = random32() % 10000; + /* Fail within 10000 operations */ + d->pc_cnt_max = delay; + ubifs_warn("failing after %lu calls", delay); } } - d->fail_cnt += 1; + + d->pc_cnt += 1; } + /* Determine if failure delay has expired */ - if (d->fail_delay == 1) { - if (time_before(jiffies, d->fail_timeout)) + if (d->pc_delay == 1 && time_before(jiffies, d->pc_timeout)) return 0; - } else if (d->fail_delay == 2) - if (d->fail_cnt++ < d->fail_cnt_max) + if (d->pc_delay == 2 && d->pc_cnt++ < d->pc_cnt_max) return 0; + if (lnum == UBIFS_SB_LNUM) { 
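
The new chance(n, out_of) helper fires with probability n/out_of: random32() % out_of is uniform over [0, out_of - 1], so adding one gives [1, out_of], and exactly n of those values satisfy <= n. Hence chance(1, 2) in power_cut_emulated() is a coin flip, while "if (chance(19, 20)) return 0;" means a 5% chance of injecting the failure at that site. A quick standalone check, with rand() standing in for the kernel's random32():

#include <stdio.h>
#include <stdlib.h>

static int chance(unsigned int n, unsigned int out_of)
{
	return (rand() % out_of) + 1 <= n;	/* true n times out of out_of */
}

int main(void)
{
	unsigned int hits = 0, i;

	srand(1);
	for (i = 0; i < 1000000; i++)
		hits += chance(19, 20);
	printf("observed p = %.4f (expected 0.9500)\n", hits / 1e6);
	return 0;
}
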
- if (write) { - if (chance(1, 2)) - return 0; - } else if (chance(19, 20)) + if (write && chance(1, 2)) + return 0; + if (chance(19, 20)) return 0; - dbg_rcvry("failing in super block LEB %d", lnum); + ubifs_warn("failing in super block LEB %d", lnum); } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { if (chance(19, 20)) return 0; - dbg_rcvry("failing in master LEB %d", lnum); + ubifs_warn("failing in master LEB %d", lnum); } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { - if (write) { - if (chance(99, 100)) - return 0; - } else if (chance(399, 400)) + if (write && chance(99, 100)) return 0; - dbg_rcvry("failing in log LEB %d", lnum); + if (chance(399, 400)) + return 0; + ubifs_warn("failing in log LEB %d", lnum); } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { - if (write) { - if (chance(7, 8)) - return 0; - } else if (chance(19, 20)) + if (write && chance(7, 8)) return 0; - dbg_rcvry("failing in LPT LEB %d", lnum); + if (chance(19, 20)) + return 0; + ubifs_warn("failing in LPT LEB %d", lnum); } else if (lnum >= c->orph_first && lnum <= c->orph_last) { - if (write) { - if (chance(1, 2)) - return 0; - } else if (chance(9, 10)) + if (write && chance(1, 2)) + return 0; + if (chance(9, 10)) return 0; - dbg_rcvry("failing in orphan LEB %d", lnum); + ubifs_warn("failing in orphan LEB %d", lnum); } else if (lnum == c->ihead_lnum) { if (chance(99, 100)) return 0; - dbg_rcvry("failing in index head LEB %d", lnum); + ubifs_warn("failing in index head LEB %d", lnum); } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { if (chance(9, 10)) return 0; - dbg_rcvry("failing in GC head LEB %d", lnum); + ubifs_warn("failing in GC head LEB %d", lnum); } else if (write && !RB_EMPTY_ROOT(&c->buds) && !ubifs_search_bud(c, lnum)) { if (chance(19, 20)) return 0; - dbg_rcvry("failing in non-bud LEB %d", lnum); + ubifs_warn("failing in non-bud LEB %d", lnum); } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || c->cmt_state == COMMIT_RUNNING_REQUIRED) { if (chance(999, 1000)) return 0; - dbg_rcvry("failing in bud LEB %d commit running", lnum); + ubifs_warn("failing in bud LEB %d commit running", lnum); } else { if (chance(9999, 10000)) return 0; - dbg_rcvry("failing in bud LEB %d commit not running", lnum); + ubifs_warn("failing in bud LEB %d commit not running", lnum); } - ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); - d->failure_mode = 1; + + d->pc_happened = 1; + ubifs_warn("========== Power cut emulated =========="); dump_stack(); return 1; } -static void cut_data(const void *buf, int len) +static void cut_data(const void *buf, unsigned int len) { - int flen, i; + unsigned int from, to, i, ffs = chance(1, 2); unsigned char *p = (void *)buf; - flen = (len * (long long)simple_rand()) >> 15; - for (i = flen; i < len; i++) - p[i] = 0xff; -} + from = random32() % (len + 1); + if (chance(1, 2)) + to = random32() % (len - from + 1); + else + to = len; -int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, - int len, int check) -{ - if (in_failure_mode(desc)) - return -EROFS; - return ubi_leb_read(desc, lnum, buf, offset, len, check); + if (from < to) + ubifs_warn("filled bytes %u-%u with %s", from, to - 1, + ffs ? 
"0xFFs" : "random data"); + + if (ffs) + for (i = from; i < to; i++) + p[i] = 0xFF; + else + for (i = from; i < to; i++) + p[i] = random32() % 0x100; } -int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, - int offset, int len, int dtype) +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, + int offs, int len, int dtype) { int err, failing; - if (in_failure_mode(desc)) + if (c->dbg->pc_happened) return -EROFS; - failing = do_fail(desc, lnum, 1); + + failing = power_cut_emulated(c, lnum, 1); if (failing) cut_data(buf, len); - err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); if (err) return err; if (failing) @@ -2706,162 +2678,207 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, return 0; } -int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, int dtype) { int err; - if (do_fail(desc, lnum, 1)) + if (c->dbg->pc_happened) return -EROFS; - err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (power_cut_emulated(c, lnum, 1)) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); if (err) return err; - if (do_fail(desc, lnum, 1)) + if (power_cut_emulated(c, lnum, 1)) return -EROFS; return 0; } -int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +int dbg_leb_unmap(struct ubifs_info *c, int lnum) { int err; - if (do_fail(desc, lnum, 0)) + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 0)) return -EROFS; - err = ubi_leb_erase(desc, lnum); + err = ubi_leb_unmap(c->ubi, lnum); if (err) return err; - if (do_fail(desc, lnum, 0)) + if (power_cut_emulated(c, lnum, 0)) return -EROFS; return 0; } -int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) +int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype) { int err; - if (do_fail(desc, lnum, 0)) + if (c->dbg->pc_happened) return -EROFS; - err = ubi_leb_unmap(desc, lnum); + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + err = ubi_leb_map(c->ubi, lnum, dtype); if (err) return err; - if (do_fail(desc, lnum, 0)) + if (power_cut_emulated(c, lnum, 0)) return -EROFS; return 0; } -int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) -{ - if (in_failure_mode(desc)) - return -EROFS; - return ubi_is_mapped(desc, lnum); -} +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *dfs_rootdir; -int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +static int dfs_file_open(struct inode *inode, struct file *file) { - int err; - - if (do_fail(desc, lnum, 0)) - return -EROFS; - err = ubi_leb_map(desc, lnum, dtype); - if (err) - return err; - if (do_fail(desc, lnum, 0)) - return -EROFS; - return 0; + file->private_data = inode->i_private; + return nonseekable_open(inode, file); } /** - * ubifs_debugging_init - initialize UBIFS debugging. - * @c: UBIFS file-system description object + * provide_user_output - provide output to the user reading a debugfs file. + * @val: boolean value for the answer + * @u: the buffer to store the answer at + * @count: size of the buffer + * @ppos: position in the @u output buffer * - * This function initializes debugging-related data for the file system. 
- * Returns zero in case of success and a negative error code in case of + * This is a simple helper function which stores @val boolean value in the user + * buffer when the user reads one of UBIFS debugfs files. Returns amount of + * bytes written to @u in case of success and a negative error code in case of * failure. */ -int ubifs_debugging_init(struct ubifs_info *c) +static int provide_user_output(int val, char __user *u, size_t count, + loff_t *ppos) { - c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); - if (!c->dbg) - return -ENOMEM; + char buf[3]; - failure_mode_init(c); - return 0; + if (val) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + + return simple_read_from_buffer(u, count, ppos, buf, 2); } -/** - * ubifs_debugging_exit - free debugging data. - * @c: UBIFS file-system description object - */ -void ubifs_debugging_exit(struct ubifs_info *c) +static ssize_t dfs_file_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) { - failure_mode_exit(c); - kfree(c->dbg); -} + struct dentry *dent = file->f_path.dentry; + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + int val; + + if (dent == d->dfs_chk_gen) + val = d->chk_gen; + else if (dent == d->dfs_chk_index) + val = d->chk_index; + else if (dent == d->dfs_chk_orph) + val = d->chk_orph; + else if (dent == d->dfs_chk_lprops) + val = d->chk_lprops; + else if (dent == d->dfs_chk_fs) + val = d->chk_fs; + else if (dent == d->dfs_tst_rcvry) + val = d->tst_rcvry; + else + return -EINVAL; -/* - * Root directory for UBIFS stuff in debugfs. Contains sub-directories which - * contain the stuff specific to particular file-system mounts. - */ -static struct dentry *dfs_rootdir; + return provide_user_output(val, u, count, ppos); +} /** - * dbg_debugfs_init - initialize debugfs file-system. + * interpret_user_input - interpret user debugfs file input. + * @u: user-provided buffer with the input + * @count: buffer size * - * UBIFS uses debugfs file-system to expose various debugging knobs to - * user-space. This function creates "ubifs" directory in the debugfs - * file-system. Returns zero in case of success and a negative error code in - * case of failure. + * This is a helper function which interpret user input to a boolean UBIFS + * debugfs file. Returns %0 or %1 in case of success and a negative error code + * in case of failure. */ -int dbg_debugfs_init(void) +static int interpret_user_input(const char __user *u, size_t count) { - dfs_rootdir = debugfs_create_dir("ubifs", NULL); - if (IS_ERR(dfs_rootdir)) { - int err = PTR_ERR(dfs_rootdir); - ubifs_err("cannot create \"ubifs\" debugfs directory, " - "error %d\n", err); - return err; - } + size_t buf_size; + char buf[8]; - return 0; -} + buf_size = min_t(size_t, count, (sizeof(buf) - 1)); + if (copy_from_user(buf, u, buf_size)) + return -EFAULT; -/** - * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. 
- */ -void dbg_debugfs_exit(void) -{ - debugfs_remove(dfs_rootdir); -} + if (buf[0] == '1') + return 1; + else if (buf[0] == '0') + return 0; -static int open_debugfs_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return nonseekable_open(inode, file); + return -EINVAL; } -static ssize_t write_debugfs_file(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t dfs_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) { struct ubifs_info *c = file->private_data; struct ubifs_debug_info *d = c->dbg; + struct dentry *dent = file->f_path.dentry; + int val; - if (file->f_path.dentry == d->dfs_dump_lprops) + /* + * TODO: this is racy - the file-system might have already been + * unmounted and we'd oops in this case. The plan is to fix it with + * help of 'iterate_supers_type()' which we should have in v3.0: when + * a debugfs opened, we rember FS's UUID in file->private_data. Then + * whenever we access the FS via a debugfs file, we iterate all UBIFS + * superblocks and fine the one with the same UUID, and take the + * locking right. + * + * The other way to go suggested by Al Viro is to create a separate + * 'ubifs-debug' file-system instead. + */ + if (file->f_path.dentry == d->dfs_dump_lprops) { dbg_dump_lprops(c); - else if (file->f_path.dentry == d->dfs_dump_budg) + return count; + } + if (file->f_path.dentry == d->dfs_dump_budg) { dbg_dump_budg(c, &c->bi); - else if (file->f_path.dentry == d->dfs_dump_tnc) { + return count; + } + if (file->f_path.dentry == d->dfs_dump_tnc) { mutex_lock(&c->tnc_mutex); dbg_dump_tnc(c); mutex_unlock(&c->tnc_mutex); - } else + return count; + } + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == d->dfs_chk_gen) + d->chk_gen = val; + else if (dent == d->dfs_chk_index) + d->chk_index = val; + else if (dent == d->dfs_chk_orph) + d->chk_orph = val; + else if (dent == d->dfs_chk_lprops) + d->chk_lprops = val; + else if (dent == d->dfs_chk_fs) + d->chk_fs = val; + else if (dent == d->dfs_tst_rcvry) + d->tst_rcvry = val; + else return -EINVAL; return count; } static const struct file_operations dfs_fops = { - .open = open_debugfs_file, - .write = write_debugfs_file, + .open = dfs_file_open, + .read = dfs_file_read, + .write = dfs_file_write, .owner = THIS_MODULE, .llseek = no_llseek, }; @@ -2880,12 +2897,20 @@ static const struct file_operations dfs_fops = { */ int dbg_debugfs_init_fs(struct ubifs_info *c) { - int err; + int err, n; const char *fname; struct dentry *dent; struct ubifs_debug_info *d = c->dbg; - sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + c->vi.ubi_num, c->vi.vol_id); + if (n == UBIFS_DFS_DIR_LEN) { + /* The array size is too small */ + fname = UBIFS_DFS_DIR_NAME; + dent = ERR_PTR(-EINVAL); + goto out; + } + fname = d->dfs_dir_name; dent = debugfs_create_dir(fname, dfs_rootdir); if (IS_ERR_OR_NULL(dent)) @@ -2910,13 +2935,55 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) goto out_remove; d->dfs_dump_tnc = dent; + fname = "chk_general"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_gen = dent; + + fname = "chk_index"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_index = dent; + + fname = "chk_orphans"; + dent = 
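
dbg_debugfs_init_fs() above also tightens the directory-name handling: dfs_dir_name shrinks from a generous char[100] to exactly UBIFS_DFS_DIR_LEN + 1 bytes (the macro, defined in the debug.h hunk at the end of this patch, budgets 3 bytes for "ubi", 1 for "_", 2x2 digits and the trailing NUL, i.e. 9), and the snprintf() return value is checked. Since snprintf() returns the length the formatted name would have needed, n == UBIFS_DFS_DIR_LEN signals that the name filled the whole buffer, i.e. one of the numbers needed more than the budgeted two digits:

	char buf[UBIFS_DFS_DIR_LEN + 1];	/* 10 bytes */
	int n;

	n = snprintf(buf, sizeof(buf), "ubi%d_%d", 3, 7);    /* "ubi3_7",    n == 6, ok       */
	n = snprintf(buf, sizeof(buf), "ubi%d_%d", 31, 64);  /* "ubi31_64",  n == 8, ok       */
	n = snprintf(buf, sizeof(buf), "ubi%d_%d", 31, 127); /* "ubi31_127", n == 9, rejected */
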
debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_orph = dent; + + fname = "chk_lprops"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_lprops = dent; + + fname = "chk_fs"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_fs = dent; + + fname = "tst_recovery"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_tst_rcvry = dent; + return 0; out_remove: debugfs_remove_recursive(d->dfs_dir); out: err = dent ? PTR_ERR(dent) : -ENODEV; - ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", + ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n", fname, err); return err; } @@ -2930,4 +2997,179 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c) debugfs_remove_recursive(c->dbg->dfs_dir); } +struct ubifs_global_debug_info ubifs_dbg; + +static struct dentry *dfs_chk_gen; +static struct dentry *dfs_chk_index; +static struct dentry *dfs_chk_orph; +static struct dentry *dfs_chk_lprops; +static struct dentry *dfs_chk_fs; +static struct dentry *dfs_tst_rcvry; + +static ssize_t dfs_global_file_read(struct file *file, char __user *u, + size_t count, loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + int val; + + if (dent == dfs_chk_gen) + val = ubifs_dbg.chk_gen; + else if (dent == dfs_chk_index) + val = ubifs_dbg.chk_index; + else if (dent == dfs_chk_orph) + val = ubifs_dbg.chk_orph; + else if (dent == dfs_chk_lprops) + val = ubifs_dbg.chk_lprops; + else if (dent == dfs_chk_fs) + val = ubifs_dbg.chk_fs; + else if (dent == dfs_tst_rcvry) + val = ubifs_dbg.tst_rcvry; + else + return -EINVAL; + + return provide_user_output(val, u, count, ppos); +} + +static ssize_t dfs_global_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + int val; + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == dfs_chk_gen) + ubifs_dbg.chk_gen = val; + else if (dent == dfs_chk_index) + ubifs_dbg.chk_index = val; + else if (dent == dfs_chk_orph) + ubifs_dbg.chk_orph = val; + else if (dent == dfs_chk_lprops) + ubifs_dbg.chk_lprops = val; + else if (dent == dfs_chk_fs) + ubifs_dbg.chk_fs = val; + else if (dent == dfs_tst_rcvry) + ubifs_dbg.tst_rcvry = val; + else + return -EINVAL; + + return count; +} + +static const struct file_operations dfs_global_fops = { + .read = dfs_global_file_read, + .write = dfs_global_file_write, + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +/** + * dbg_debugfs_init - initialize debugfs file-system. + * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. 
+ */
+int dbg_debugfs_init(void)
+{
+	int err;
+	const char *fname;
+	struct dentry *dent;
+
+	fname = "ubifs";
+	dent = debugfs_create_dir(fname, NULL);
+	if (IS_ERR_OR_NULL(dent))
+		goto out;
+	dfs_rootdir = dent;
+
+	fname = "chk_general";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_gen = dent;
+
+	fname = "chk_index";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_index = dent;
+
+	fname = "chk_orphans";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_orph = dent;
+
+	fname = "chk_lprops";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_lprops = dent;
+
+	fname = "chk_fs";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_chk_fs = dent;
+
+	fname = "tst_recovery";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL,
+				   &dfs_global_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	dfs_tst_rcvry = dent;
+
+	return 0;
+
+out_remove:
+	debugfs_remove_recursive(dfs_rootdir);
+out:
+	err = dent ? PTR_ERR(dent) : -ENODEV;
+	ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n",
+		  fname, err);
+	return err;
+}
+
+/**
+ * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
+ */
+void dbg_debugfs_exit(void)
+{
+	debugfs_remove_recursive(dfs_rootdir);
+}
+
+/**
+ * ubifs_debugging_init - initialize UBIFS debugging.
+ * @c: UBIFS file-system description object
+ *
+ * This function initializes debugging-related data for the file system.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_debugging_init(struct ubifs_info *c)
+{
+	c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
+	if (!c->dbg)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * ubifs_debugging_exit - free debugging data.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_debugging_exit(struct ubifs_info *c)
+{
+	kfree(c->dbg);
+}
+
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index a811ac4a26bb..45174b534377 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,18 +31,25 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
-#include 
+/*
+ * The UBIFS debugfs directory name pattern and maximum name length (3 for
+ * "ubi" + 1 for "_" + 2x2 for the two UBI numbers + 1 for the trailing zero
+ * byte).
+ */
+#define UBIFS_DFS_DIR_NAME "ubi%d_%d"
+#define UBIFS_DFS_DIR_LEN  (3 + 1 + 2*2 + 1)
 
 /**
  * ubifs_debug_info - per-FS debugging information.
* @old_zroot: old index root - used by 'dbg_check_old_index()' * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' - * @failure_mode: failure mode for recovery testing - * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls - * @fail_timeout: time in jiffies when delay of failure mode expires - * @fail_cnt: current number of calls to failure mode I/O functions - * @fail_cnt_max: number of calls by which to delay failure mode + * + * @pc_happened: non-zero if an emulated power cut happened + * @pc_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @pc_timeout: time in jiffies when delay of failure mode expires + * @pc_cnt: current number of calls to failure mode I/O functions + * @pc_cnt_max: number of calls by which to delay failure mode + * * @chk_lpt_sz: used by LPT tree size checker * @chk_lpt_sz2: used by LPT tree size checker * @chk_lpt_wastage: used by LPT tree size checker @@ -56,21 +63,36 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, * @saved_free: saved amount of free space * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index xtra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode enabled + * * @dfs_dir_name: name of debugfs directory containing this file-system's files * @dfs_dir: direntry object of the file-system debugfs directory * @dfs_dump_lprops: "dump lprops" debugfs knob * @dfs_dump_budg: "dump budgeting information" debugfs knob * @dfs_dump_tnc: "dump TNC" debugfs knob + * @dfs_chk_gen: debugfs knob to enable UBIFS general extra checks + * @dfs_chk_index: debugfs knob to enable UBIFS index extra checks + * @dfs_chk_orph: debugfs knob to enable UBIFS orphans extra checks + * @dfs_chk_lprops: debugfs knob to enable UBIFS LEP properties extra checks + * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks + * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing */ struct ubifs_debug_info { struct ubifs_zbranch old_zroot; int old_zroot_level; unsigned long long old_zroot_sqnum; - int failure_mode; - int fail_delay; - unsigned long fail_timeout; - unsigned int fail_cnt; - unsigned int fail_cnt_max; + + int pc_happened; + int pc_delay; + unsigned long pc_timeout; + unsigned int pc_cnt; + unsigned int pc_cnt_max; + long long chk_lpt_sz; long long chk_lpt_sz2; long long chk_lpt_wastage; @@ -84,11 +106,43 @@ struct ubifs_debug_info { long long saved_free; int saved_idx_gc_cnt; - char dfs_dir_name[100]; + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; + + char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1]; struct dentry *dfs_dir; struct dentry *dfs_dump_lprops; struct dentry *dfs_dump_budg; struct dentry *dfs_dump_tnc; + struct dentry *dfs_chk_gen; + struct dentry *dfs_chk_index; + struct dentry *dfs_chk_orph; + struct dentry *dfs_chk_lprops; + struct dentry *dfs_chk_fs; + struct dentry *dfs_tst_rcvry; +}; + +/** + * ubifs_global_debug_info - global (not per-FS) UBIFS debugging information. 
+ * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index xtra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode enabled + */ +struct ubifs_global_debug_info { + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; }; #define ubifs_assert(expr) do { \ @@ -127,6 +181,8 @@ const char *dbg_key_str1(const struct ubifs_info *c, #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) +extern spinlock_t dbg_lock; + #define ubifs_dbg_msg(type, fmt, ...) do { \ spin_lock(&dbg_lock); \ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ @@ -162,41 +218,36 @@ const char *dbg_key_str1(const struct ubifs_info *c, /* Additional recovery messages */ #define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) -/* - * Debugging check flags. - * - * UBIFS_CHK_GEN: general checks - * UBIFS_CHK_TNC: check TNC - * UBIFS_CHK_IDX_SZ: check index size - * UBIFS_CHK_ORPH: check orphans - * UBIFS_CHK_OLD_IDX: check the old index - * UBIFS_CHK_LPROPS: check lprops - * UBIFS_CHK_FS: check the file-system - */ -enum { - UBIFS_CHK_GEN = 0x1, - UBIFS_CHK_TNC = 0x2, - UBIFS_CHK_IDX_SZ = 0x4, - UBIFS_CHK_ORPH = 0x8, - UBIFS_CHK_OLD_IDX = 0x10, - UBIFS_CHK_LPROPS = 0x20, - UBIFS_CHK_FS = 0x40, -}; - -/* - * Special testing flags. - * - * UBIFS_TST_RCVRY: failure mode for recovery testing - */ -enum { - UBIFS_TST_RCVRY = 0x4, -}; - -extern spinlock_t dbg_lock; +extern struct ubifs_global_debug_info ubifs_dbg; -extern unsigned int ubifs_msg_flags; -extern unsigned int ubifs_chk_flags; -extern unsigned int ubifs_tst_flags; +static inline int dbg_is_chk_gen(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_gen || c->dbg->chk_gen); +} +static inline int dbg_is_chk_index(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_index || c->dbg->chk_index); +} +static inline int dbg_is_chk_orph(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_orph || c->dbg->chk_orph); +} +static inline int dbg_is_chk_lprops(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_lprops || c->dbg->chk_lprops); +} +static inline int dbg_is_chk_fs(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_fs || c->dbg->chk_fs); +} +static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.tst_rcvry || c->dbg->tst_rcvry); +} +static inline int dbg_is_power_cut(const struct ubifs_info *c) +{ + return !!c->dbg->pc_happened; +} int ubifs_debugging_init(struct ubifs_info *c); void ubifs_debugging_exit(struct ubifs_info *c); @@ -207,7 +258,7 @@ const char *dbg_cstate(int cmt_state); const char *dbg_jhead(int jhead); const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); -void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); +void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); void dbg_dump_node(const struct ubifs_info *c, const void *node); void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, int offs); @@ -240,8 +291,8 @@ int dbg_check_cats(struct ubifs_info *c); int dbg_check_ltab(struct ubifs_info *c); int dbg_chk_lpt_free_spc(struct ubifs_info *c); int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); -int dbg_check_synced_i_size(struct inode *inode); -int 
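
The dbg_is_chk_*() helpers above OR the global knob with the per-file-system
one, so a class of checks fires when either is enabled. The call-site
conversions in the hunks below all follow the same early-return pattern;
schematically (dbg_check_example() is a hypothetical name, not part of the
patch):

	static int dbg_check_example(struct ubifs_info *c)
	{
		if (!dbg_is_chk_gen(c))
			return 0;	/* this class of checks is disabled */

		/* ... perform the expensive consistency checks ... */
		return 0;
	}
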
dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode); +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir); int dbg_check_tnc(struct ubifs_info *c, int extra); int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); int dbg_check_filesystem(struct ubifs_info *c); @@ -254,54 +305,12 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); -/* Force the use of in-the-gaps method for testing */ -static inline int dbg_force_in_the_gaps_enabled(void) -{ - return ubifs_chk_flags & UBIFS_CHK_GEN; -} -int dbg_force_in_the_gaps(void); - -/* Failure mode for recovery testing */ -#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) - -#ifndef UBIFS_DBG_PRESERVE_UBI -#define ubi_leb_read dbg_leb_read -#define ubi_leb_write dbg_leb_write -#define ubi_leb_change dbg_leb_change -#define ubi_leb_erase dbg_leb_erase -#define ubi_leb_unmap dbg_leb_unmap -#define ubi_is_mapped dbg_is_mapped -#define ubi_leb_map dbg_leb_map -#endif - -int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, - int len, int check); -int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, - int offset, int len, int dtype); -int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, - int len, int dtype); -int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); -int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); -int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); -int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); - -static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, - int offset, int len) -{ - return dbg_leb_read(desc, lnum, buf, offset, len, 0); -} - -static inline int dbg_write(struct ubi_volume_desc *desc, int lnum, - const void *buf, int offset, int len) -{ - return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); -} - -static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, - const void *buf, int len) -{ - return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); -} +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len, int dtype); +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, + int dtype); +int dbg_leb_unmap(struct ubifs_info *c, int lnum); +int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype); /* Debugfs-related stuff */ int dbg_debugfs_init(void); @@ -313,7 +322,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); /* Use "if (0)" to make compiler check arguments even if debugging is off */ #define ubifs_assert(expr) do { \ - if (0 && (expr)) \ + if (0) \ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ } while (0) @@ -323,6 +332,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); ubifs_err(fmt, ##__VA_ARGS__); \ } while (0) +#define DBGKEY(key) ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + #define ubifs_dbg_msg(fmt, ...) do { \ if (0) \ pr_debug(fmt "\n", ##__VA_ARGS__); \ @@ -346,9 +358,6 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) #define dbg_rcvry(fmt, ...) 
ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define DBGKEY(key) ((char *)(key)) -#define DBGKEY1(key) ((char *)(key)) - static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } static inline const char *dbg_ntype(int type) { return ""; } @@ -357,7 +366,7 @@ static inline const char *dbg_jhead(int jhead) { return ""; } static inline const char * dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key) { return ""; } -static inline void dbg_dump_inode(const struct ubifs_info *c, +static inline void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) { return; } static inline void dbg_dump_node(const struct ubifs_info *c, const void *node) { return; } @@ -409,9 +418,11 @@ static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; } static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; } static inline int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) { return 0; } -static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; } -static inline int dbg_check_dir_size(struct ubifs_info *c, - const struct inode *dir) { return 0; } +static inline int +dbg_check_synced_i_size(const struct ubifs_info *c, + struct inode *inode) { return 0; } +static inline int dbg_check_dir(struct ubifs_info *c, + const struct inode *dir) { return 0; } static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; } static inline int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) { return 0; } @@ -431,9 +442,23 @@ static inline int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) { return 0; } -static inline int dbg_force_in_the_gaps(void) { return 0; } -#define dbg_force_in_the_gaps_enabled() 0 -#define dbg_failure_mode 0 +static inline int dbg_leb_write(struct ubifs_info *c, int lnum, + const void *buf, int offset, + int len, int dtype) { return 0; } +static inline int dbg_leb_change(struct ubifs_info *c, int lnum, + const void *buf, int len, + int dtype) { return 0; } +static inline int dbg_leb_unmap(struct ubifs_info *c, int lnum) { return 0; } +static inline int dbg_leb_map(struct ubifs_info *c, int lnum, + int dtype) { return 0; } + +static inline int dbg_is_chk_gen(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_index(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_orph(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_lprops(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_fs(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_power_cut(const struct ubifs_info *c) { return 0; } static inline int dbg_debugfs_init(void) { return 0; } static inline void dbg_debugfs_exit(void) { return; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ef5abd38f0bf..683492043317 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -102,7 +102,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, * UBIFS has to fully control "clean <-> dirty" transitions of inodes * to make budgeting work. 
*/ - inode->i_flags |= (S_NOCMTIME); + inode->i_flags |= S_NOCMTIME; inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = @@ -172,9 +172,11 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, #ifdef CONFIG_UBIFS_FS_DEBUG -static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) +static int dbg_check_name(const struct ubifs_info *c, + const struct ubifs_dent_node *dent, + const struct qstr *nm) { - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (le16_to_cpu(dent->nlen) != nm->len) return -EINVAL; @@ -185,7 +187,7 @@ static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) #else -#define dbg_check_name(dent, nm) 0 +#define dbg_check_name(c, dent, nm) 0 #endif @@ -219,7 +221,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - if (dbg_check_name(dent, &dentry->d_name)) { + if (dbg_check_name(c, dent, &dentry->d_name)) { err = -EINVAL; goto out; } @@ -522,7 +524,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - err = dbg_check_synced_i_size(inode); + err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -577,7 +579,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) inode->i_nlink, dir->i_ino); ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - err = dbg_check_synced_i_size(inode); + err = dbg_check_synced_i_size(c, inode); if (err) return err; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5e7fccfc4b29..7cf738a4544d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1263,7 +1263,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - err = dbg_check_synced_i_size(inode); + err = dbg_check_synced_i_size(c, inode); if (err) return err; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3be645e012c9..9228950a658f 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -86,8 +86,125 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) c->no_chk_data_crc = 0; c->vfs_sb->s_flags |= MS_RDONLY; ubifs_warn("switched to read-only mode, error %d", err); + dump_stack(); + } +} + +/* + * Below are simple wrappers over UBI I/O functions which include some + * additional checks and UBIFS debugging stuff. See corresponding UBI function + * for more information. + */ + +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg) +{ + int err; + + err = ubi_read(c->ubi, lnum, buf, offs, len); + /* + * In case of %-EBADMSG print the error message only if the + * @even_ebadmsg is true. 
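
Note that @even_ebadmsg controls only the error message: -EBADMSG itself is
always returned to the caller. Callers that can recover from bit-flips pass 0
and filter the error themselves, as in the recovery hunks further below (this
mirrors the get_master_node() change):

	err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0);
	if (err && err != -EBADMSG)
		goto out_free;	/* a real I/O error */
	/* on -EBADMSG, fall through and try to salvage the LEB contents */

LPT and index readers, by contrast, pass 1, because corrupted metadata there
always deserves a loud report.
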
+ */ + if (err && (err != -EBADMSG || even_ebadmsg)) { + ubifs_err("reading %d bytes from LEB %d:%d failed, error %d", + len, lnum, offs, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + else + err = dbg_leb_write(c, lnum, buf, offs, len, dtype); + if (err) { + ubifs_err("writing %d bytes to LEB %d:%d failed, error %d", + len, lnum, offs, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, + int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + else + err = dbg_leb_change(c, lnum, buf, len, dtype); + if (err) { + ubifs_err("changing %d bytes in LEB %d failed, error %d", + len, lnum, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_unmap(struct ubifs_info *c, int lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_unmap(c->ubi, lnum); + else + err = dbg_leb_unmap(c, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_map(c->ubi, lnum, dtype); + else + err = dbg_leb_map(c, lnum, dtype); + if (err) { + ubifs_err("mapping LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_is_mapped(const struct ubifs_info *c, int lnum) +{ + int err; + + err = ubi_is_mapped(c->ubi, lnum); + if (err < 0) { + ubifs_err("ubi_is_mapped failed for LEB %d, error %d", + lnum, err); dbg_dump_stack(); } + return err; } /** @@ -406,14 +523,10 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dirt = sync_len - wbuf->used; if (dirt) ubifs_pad(c, wbuf->buf + wbuf->used, dirt); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - sync_len, wbuf->dtype); - if (err) { - ubifs_err("cannot write %d bytes to LEB %d:%d", - sync_len, wbuf->lnum, wbuf->offs); - dbg_dump_stack(); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len, + wbuf->dtype); + if (err) return err; - } spin_lock(&wbuf->lock); wbuf->offs += sync_len; @@ -605,9 +718,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (aligned_len == wbuf->avail) { dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, - wbuf->offs, wbuf->size, - wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, + wbuf->offs, wbuf->size, + wbuf->dtype); if (err) goto out; @@ -642,8 +755,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - wbuf->size, wbuf->dtype); + err = ubifs_leb_write(c, 
wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); if (err) goto out; @@ -661,8 +774,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ dbg_io("write %d bytes to LEB %d:%d", wbuf->size, wbuf->lnum, wbuf->offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, - wbuf->size, wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); if (err) goto out; @@ -683,8 +796,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) n <<= c->max_write_shift; dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, - wbuf->offs, n, wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, n, wbuf->dtype); if (err) goto out; wbuf->offs += n; @@ -766,13 +879,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, return -EROFS; ubifs_prepare_node(c, buf, len, 1); - err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); - if (err) { - ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", - buf_len, lnum, offs, err); + err = ubifs_leb_write(c, lnum, buf, offs, buf_len, dtype); + if (err) dbg_dump_node(c, buf); - dbg_dump_stack(); - } return err; } @@ -824,13 +933,9 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, if (rlen > 0) { /* Read everything that goes before write-buffer */ - err = ubi_read(c->ubi, lnum, buf, offs, rlen); - if (err && err != -EBADMSG) { - ubifs_err("failed to read node %d from LEB %d:%d, " - "error %d", type, lnum, offs, err); - dbg_dump_stack(); + err = ubifs_leb_read(c, lnum, buf, offs, rlen, 0); + if (err && err != -EBADMSG) return err; - } } if (type != ch->node_type) { @@ -885,12 +990,9 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, ubifs_assert(!(offs & 7) && offs < c->leb_size); ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); - err = ubi_read(c->ubi, lnum, buf, offs, len); - if (err && err != -EBADMSG) { - ubifs_err("cannot read node %d from LEB %d:%d, error %d", - type, lnum, offs, err); + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); + if (err && err != -EBADMSG) return err; - } if (type != ch->node_type) { ubifs_err("bad node type (%d but expected %d)", diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index affea9494ae2..f9fd068d1ae0 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -262,7 +262,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) * an unclean reboot, because the target LEB might have been * unmapped, but not yet physically erased. 
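
All of the write-side wrappers above share one shape: assert that the
file-system is writable, route the operation through the dbg_leb_*()
failure-injection variant when recovery testing is enabled, and on error
report, switch to read-only mode and dump the stack. Centralizing the error
handling is what lets callers shed their own ubifs_ro_mode() calls (see the
log.c hunk just below). Condensed, with the map variant as the example:

	ubifs_assert(!c->ro_media && !c->ro_mount);
	if (c->ro_error)
		return -EROFS;
	if (!dbg_is_tst_rcvry(c))
		err = ubi_leb_map(c->ubi, lnum, dtype);	/* normal path */
	else
		err = dbg_leb_map(c, lnum, dtype);	/* may inject a failure */
	if (err) {
		ubifs_err("mapping LEB %d failed, error %d", lnum, err);
		ubifs_ro_mode(c, err);	/* a failed write is not recoverable */
		dbg_dump_stack();
	}
	return err;
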
*/ - err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM); + err = ubifs_leb_map(c, bud->lnum, UBI_SHORTTERM); if (err) goto out_unlock; } @@ -283,8 +283,6 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) return 0; out_unlock: - if (err != -EAGAIN) - ubifs_ro_mode(c, err); mutex_unlock(&c->log_mutex); kfree(ref); kfree(bud); @@ -752,7 +750,7 @@ static int dbg_check_bud_bytes(struct ubifs_info *c) struct ubifs_bud *bud; long long bud_bytes = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; spin_lock(&c->buds_lock); diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 667884f4a615..f8a181e647cc 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -504,7 +504,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) pnode = (struct ubifs_pnode *)container_of(lprops - pos, struct ubifs_pnode, lprops[0]); - return !test_bit(COW_ZNODE, &pnode->flags) && + return !test_bit(COW_CNODE, &pnode->flags) && test_bit(DIRTY_CNODE, &pnode->flags); } @@ -860,7 +860,7 @@ int dbg_check_cats(struct ubifs_info *c) struct list_head *pos; int i, cat; - if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) return 0; list_for_each_entry(lprops, &c->empty_list, list) { @@ -958,7 +958,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, { int i = 0, j, err = 0; - if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) return; for (i = 0; i < heap->cnt; i++) { @@ -1262,7 +1262,7 @@ int dbg_check_lprops(struct ubifs_info *c) int i, err; struct ubifs_lp_stats lst; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; /* diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index ef5155e109a2..6189c74d97f0 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -701,8 +701,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, + UBI_SHORTTERM); if (err) goto out; p = buf; @@ -732,8 +732,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, + UBI_SHORTTERM); if (err) goto out; p = buf; @@ -780,8 +780,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, + UBI_SHORTTERM); if (err) goto out; p = buf; @@ -806,7 +806,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, UBI_SHORTTERM); if (err) goto out; p = buf; @@ -826,7 +826,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Write remaining buffer */ memset(p, 0xff, alen - len); - err = 
ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, buf, alen, UBI_SHORTTERM); if (err) goto out; @@ -1222,7 +1222,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) if (c->big_lpt) nnode->num = calc_nnode_num_from_parent(c, parent, iip); } else { - err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); + err = ubifs_leb_read(c, lnum, buf, offs, c->nnode_sz, 1); if (err) goto out; err = ubifs_unpack_nnode(c, buf, nnode); @@ -1247,6 +1247,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + dbg_dump_stack(); kfree(nnode); return err; } @@ -1290,7 +1291,7 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) lprops->flags = ubifs_categorize_lprops(c, lprops); } } else { - err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz); + err = ubifs_leb_read(c, lnum, buf, offs, c->pnode_sz, 1); if (err) goto out; err = unpack_pnode(c, buf, pnode); @@ -1312,6 +1313,7 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); dbg_dump_pnode(c, pnode, parent, iip); + dbg_dump_stack(); dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); kfree(pnode); return err; @@ -1331,7 +1333,7 @@ static int read_ltab(struct ubifs_info *c) buf = vmalloc(c->ltab_sz); if (!buf) return -ENOMEM; - err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); + err = ubifs_leb_read(c, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz, 1); if (err) goto out; err = unpack_ltab(c, buf); @@ -1354,7 +1356,8 @@ static int read_lsave(struct ubifs_info *c) buf = vmalloc(c->lsave_sz); if (!buf) return -ENOMEM; - err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz); + err = ubifs_leb_read(c, c->lsave_lnum, buf, c->lsave_offs, + c->lsave_sz, 1); if (err) goto out; err = unpack_lsave(c, buf); @@ -1814,8 +1817,8 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, if (c->big_lpt) nnode->num = calc_nnode_num_from_parent(c, parent, iip); } else { - err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, - c->nnode_sz); + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->nnode_sz, 1); if (err) return ERR_PTR(err); err = ubifs_unpack_nnode(c, buf, nnode); @@ -1883,8 +1886,8 @@ static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, ubifs_assert(branch->lnum >= c->lpt_first && branch->lnum <= c->lpt_last); ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); - err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, - c->pnode_sz); + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->pnode_sz, 1); if (err) return ERR_PTR(err); err = unpack_pnode(c, buf, pnode); @@ -2224,7 +2227,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, struct ubifs_cnode *cn; int num, iip = 0, err; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; while (cnode) { diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index dfcb5748a7dc..cddd6bd214f4 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -27,6 +27,7 @@ #include #include +#include #include "ubifs.h" #ifdef CONFIG_UBIFS_FS_DEBUG @@ -116,8 +117,8 @@ static int get_cnodes_to_commit(struct ubifs_info *c) return 0; cnt += 1; while (1) { - ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); - __set_bit(COW_ZNODE, &cnode->flags); + 
ubifs_assert(!test_bit(COW_CNODE, &cnode->flags)); + __set_bit(COW_CNODE, &cnode->flags); cnext = next_dirty_cnode(cnode); if (!cnext) { cnode->cnext = c->lpt_cnext; @@ -465,7 +466,7 @@ static int write_cnodes(struct ubifs_info *c) */ clear_bit(DIRTY_CNODE, &cnode->flags); smp_mb__before_clear_bit(); - clear_bit(COW_ZNODE, &cnode->flags); + clear_bit(COW_CNODE, &cnode->flags); smp_mb__after_clear_bit(); offs += len; dbg_chk_lpt_sz(c, 1, len); @@ -1160,11 +1161,11 @@ static int lpt_gc_lnum(struct ubifs_info *c, int lnum) void *buf = c->lpt_buf; dbg_lp("LEB %d", lnum); - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - ubifs_err("cannot read LEB %d, error %d", lnum, err); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) return err; - } + while (1) { if (!is_a_node(c, buf, len)) { int pad_len; @@ -1640,7 +1641,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) int ret; void *buf, *p; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); @@ -1650,11 +1651,11 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) } dbg_lp("LEB %d", lnum); - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) goto out; - } + while (1) { if (!is_a_node(c, p, len)) { int i, pad_len; @@ -1711,7 +1712,7 @@ int dbg_check_ltab(struct ubifs_info *c) { int lnum, err, i, cnt; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; /* Bring the entire tree into memory */ @@ -1754,7 +1755,7 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) long long free = 0; int i; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; for (i = 0; i < c->lpt_lebs; i++) { @@ -1796,7 +1797,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) long long chk_lpt_sz, lpt_sz; int err = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; switch (action) { @@ -1901,11 +1902,10 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) return; } - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - ubifs_err("cannot read LEB %d, error %d", lnum, err); + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) goto out; - } + while (1) { offs = c->leb_size - len; if (!is_a_node(c, p, len)) { @@ -2019,7 +2019,7 @@ static int dbg_populate_lsave(struct ubifs_info *c) struct ubifs_lpt_heap *heap; int i; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (random32() & 3) return 0; diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 0b5296a9a4c5..ee7cb5ebb6e8 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -38,6 +38,29 @@ static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) return !!test_bit(DIRTY_ZNODE, &znode->flags); } +/** + * ubifs_zn_obsolete - check if znode is obsolete. + * @znode: znode to check + * + * This helper function returns %1 if @znode is obsolete and %0 otherwise. + */ +static inline int ubifs_zn_obsolete(const struct ubifs_znode *znode) +{ + return !!test_bit(OBSOLETE_ZNODE, &znode->flags); +} + +/** + * ubifs_zn_cow - check if znode has to be copied on write. + * @znode: znode to check + * + * This helper function returns %1 if @znode is has COW flag set and %0 + * otherwise. 
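
These one-line helpers exist purely to make assertion sites readable; the TNC
and commit hunks below rewrite the open-coded test_bit() calls in terms of
them, for example:

	/* before */
	ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));

	/* after */
	ubifs_assert(!ubifs_zn_obsolete(znode));
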
+ */ +static inline int ubifs_zn_cow(const struct ubifs_znode *znode) +{ + return !!test_bit(COW_ZNODE, &znode->flags); +} + /** * ubifs_wake_up_bgt - wake up background thread. * @c: UBIFS file-system description object @@ -121,86 +144,6 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) return err; } -/** - * ubifs_leb_unmap - unmap an LEB. - * @c: UBIFS file-system description object - * @lnum: LEB number to unmap - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) -{ - int err; - - ubifs_assert(!c->ro_media && !c->ro_mount); - if (c->ro_error) - return -EROFS; - err = ubi_leb_unmap(c->ubi, lnum); - if (err) { - ubifs_err("unmap LEB %d failed, error %d", lnum, err); - return err; - } - - return 0; -} - -/** - * ubifs_leb_write - write to a LEB. - * @c: UBIFS file-system description object - * @lnum: LEB number to write - * @buf: buffer to write from - * @offs: offset within LEB to write to - * @len: length to write - * @dtype: data type - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, - const void *buf, int offs, int len, int dtype) -{ - int err; - - ubifs_assert(!c->ro_media && !c->ro_mount); - if (c->ro_error) - return -EROFS; - err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); - if (err) { - ubifs_err("writing %d bytes at %d:%d, error %d", - len, lnum, offs, err); - return err; - } - - return 0; -} - -/** - * ubifs_leb_change - atomic LEB change. - * @c: UBIFS file-system description object - * @lnum: LEB number to write - * @buf: buffer to write from - * @len: length to write - * @dtype: data type - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, - const void *buf, int len, int dtype) -{ - int err; - - ubifs_assert(!c->ro_media && !c->ro_mount); - if (c->ro_error) - return -EROFS; - err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); - if (err) { - ubifs_err("changing %d bytes in LEB %d, error %d", - len, lnum, err); - return err; - } - - return 0; -} - /** * ubifs_encode_dev - encode device node IDs. 
* @dev: UBIFS device node information diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index a5422fffbd69..c542c73cfa3c 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -929,7 +929,7 @@ static int dbg_check_orphans(struct ubifs_info *c) struct check_info ci; int err; - if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + if (!dbg_is_chk_orph(c)) return 0; ci.last_ino = 0; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 783d8e0beb76..af02790d9328 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -117,7 +117,7 @@ static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, if (!sbuf) return -ENOMEM; - err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0); if (err && err != -EBADMSG) goto out_free; @@ -213,10 +213,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c, mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); - err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, mst, sz, UBI_SHORTTERM); if (err) goto out; - err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum + 1, mst, sz, UBI_SHORTTERM); if (err) goto out; out: @@ -274,7 +274,8 @@ int ubifs_recover_master_node(struct ubifs_info *c) if (cor1) goto out_err; mst = mst1; - } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + } else if (offs1 == 0 && + c->leb_size - offs2 - sz < sz) { /* 1st LEB was unmapped and written, 2nd not */ if (cor1) goto out_err; @@ -539,8 +540,8 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, int len = ALIGN(endpt, c->min_io_size); if (start) { - err = ubi_read(c->ubi, lnum, sleb->buf, 0, - start); + err = ubifs_leb_read(c, lnum, sleb->buf, 0, + start, 1); if (err) return err; } @@ -554,8 +555,8 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_pad(c, buf, pad_len); } } - err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, - UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sleb->buf, len, + UBI_UNKNOWN); if (err) return err; } @@ -819,7 +820,8 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, return -ENOMEM; if (c->leb_size - offs < UBIFS_CS_NODE_SZ) goto out_err; - err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + err = ubifs_leb_read(c, lnum, (void *)cs_node, offs, + UBIFS_CS_NODE_SZ, 0); if (err && err != -EBADMSG) goto out_free; ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); @@ -919,8 +921,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, * * This function returns %0 on success and a negative error code on failure. 
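
One subtle change in the recovery hunks above is the second-LEB test in
ubifs_recover_master_node(). The new expression is easiest to read in terms
of free space:

	/*
	 * space left after the master node at offs2:
	 *	free = c->leb_size - offs2 - sz;
	 *
	 * the new condition
	 *	free < sz
	 * is algebraically
	 *	offs2 + 2 * sz > c->leb_size
	 * i.e. "no further master node could have been written after mst2".
	 */
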
*/ -static int recover_head(const struct ubifs_info *c, int lnum, int offs, - void *sbuf) +static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf) { int len = c->max_write_size, err; @@ -931,15 +932,15 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs, return 0; /* Read at the head location and check it is empty flash */ - err = ubi_read(c->ubi, lnum, sbuf, offs, len); + err = ubifs_leb_read(c, lnum, sbuf, offs, len, 1); if (err || !is_empty(sbuf, len)) { dbg_rcvry("cleaning head at %d:%d", lnum, offs); if (offs == 0) return ubifs_leb_unmap(c, lnum); - err = ubi_read(c->ubi, lnum, sbuf, 0, offs); + err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1); if (err) return err; - return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, sbuf, offs, UBI_UNKNOWN); } return 0; @@ -962,7 +963,7 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs, * * This function returns %0 on success and a negative error code on failure. */ -int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) { int err; @@ -993,7 +994,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) * * This function returns %0 on success and a negative error code on failure. */ -static int clean_an_unclean_leb(const struct ubifs_info *c, +static int clean_an_unclean_leb(struct ubifs_info *c, struct ubifs_unclean_leb *ucleb, void *sbuf) { int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; @@ -1009,7 +1010,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, return 0; } - err = ubi_read(c->ubi, lnum, buf, offs, len); + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); if (err && err != -EBADMSG) return err; @@ -1069,7 +1070,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, } /* Write back the LEB atomically */ - err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sbuf, len, UBI_UNKNOWN); if (err) return err; @@ -1089,7 +1090,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, * * This function returns %0 on success and a negative error code on failure. 
*/ -int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf) { dbg_rcvry("recovery"); while (!list_empty(&c->unclean_leb_list)) { @@ -1454,7 +1455,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) if (i_size >= e->d_size) return 0; /* Read the LEB */ - err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + err = ubifs_leb_read(c, lnum, c->sbuf, 0, c->leb_size, 1); if (err) goto out; /* Change the size field and recalculate the CRC */ @@ -1470,7 +1471,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) len -= 1; len = ALIGN(len + 1, c->min_io_size); /* Atomically write the fixed LEB back again */ - err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); if (err) goto out; dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 5e97161ce4d3..ccabaf1164b3 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -523,8 +523,7 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) if (!list_is_last(&next->list, &jh->buds_list)) return 0; - err = ubi_read(c->ubi, next->lnum, (char *)&data, - next->start, 4); + err = ubifs_leb_read(c, next->lnum, (char *)&data, next->start, 4, 1); if (err) return 0; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index c606f010e8df..93d938ad3d2a 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -674,15 +674,15 @@ static int fixup_leb(struct ubifs_info *c, int lnum, int len) if (len == 0) { dbg_mnt("unmap empty LEB %d", lnum); - return ubi_leb_unmap(c->ubi, lnum); + return ubifs_leb_unmap(c, lnum); } dbg_mnt("fixup LEB %d, data len %d", lnum, len); - err = ubi_read(c->ubi, lnum, c->sbuf, 0, len); + err = ubifs_leb_read(c, lnum, c->sbuf, 0, len, 1); if (err) return err; - return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); } /** diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 36216b46f772..37383e8011b1 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -148,7 +148,7 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, INIT_LIST_HEAD(&sleb->nodes); sleb->buf = sbuf; - err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0); if (err && err != -EBADMSG) { ubifs_err("cannot read %d bytes from LEB %d:%d," " error %d", c->leb_size - offs, lnum, offs, err); @@ -240,7 +240,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, int len; ubifs_err("corruption at LEB %d:%d", lnum, offs); - if (dbg_failure_mode) + if (dbg_is_tst_rcvry(c)) return; len = c->leb_size - offs; if (len > 8192) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 529be0582029..b28121278d46 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -85,7 +85,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) return 4; - if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + if (ui->xattr && !S_ISREG(inode->i_mode)) return 5; if (!ubifs_compr_present(ui->compr_type)) { @@ -94,7 +94,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) ubifs_compr_name(ui->compr_type)); } - err = dbg_check_dir_size(c, inode); + err = dbg_check_dir(c, inode); return err; } @@ -914,7 +914,7 @@ static int check_volume_empty(struct ubifs_info *c) 
c->empty = 1; for (lnum = 0; lnum < c->leb_cnt; lnum++) { - err = ubi_is_mapped(c->ubi, lnum); + err = ubifs_is_mapped(c, lnum); if (unlikely(err < 0)) return err; if (err == 1) { diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 91b4213dde84..066738647685 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -223,7 +223,7 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c, __set_bit(DIRTY_ZNODE, &zn->flags); __clear_bit(COW_ZNODE, &zn->flags); - ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_obsolete(znode)); __set_bit(OBSOLETE_ZNODE, &znode->flags); if (znode->level != 0) { @@ -271,7 +271,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, struct ubifs_znode *zn; int err; - if (!test_bit(COW_ZNODE, &znode->flags)) { + if (!ubifs_zn_cow(znode)) { /* znode is not being committed */ if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { atomic_long_inc(&c->dirty_zn_cnt); @@ -462,7 +462,7 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); - err = ubi_read(c->ubi, lnum, buf, offs, len); + err = ubifs_leb_read(c, lnum, buf, offs, len, 1); if (err) { ubifs_err("cannot read node type %d from LEB %d:%d, error %d", type, lnum, offs, err); @@ -1666,7 +1666,7 @@ static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, if (!overlap) { /* We may safely unlock the write-buffer and read the data */ spin_unlock(&wbuf->lock); - return ubi_read(c->ubi, lnum, buf, offs, len); + return ubifs_leb_read(c, lnum, buf, offs, len, 0); } /* Don't read under wbuf */ @@ -1680,7 +1680,7 @@ static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, if (rlen > 0) /* Read everything that goes before write-buffer */ - return ubi_read(c->ubi, lnum, buf, offs, rlen); + return ubifs_leb_read(c, lnum, buf, offs, rlen, 0); return 0; } @@ -1767,7 +1767,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) if (wbuf) err = read_wbuf(wbuf, bu->buf, len, lnum, offs); else - err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + err = ubifs_leb_read(c, lnum, bu->buf, offs, len, 0); /* Check for a race with GC */ if (maybe_leb_gced(c, lnum, bu->gc_seq)) @@ -2423,7 +2423,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) */ do { - ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_obsolete(znode)); ubifs_assert(ubifs_zn_dirty(znode)); zp = znode->parent; @@ -2479,9 +2479,8 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) c->zroot.offs = zbr->offs; c->zroot.len = zbr->len; c->zroot.znode = znode; - ubifs_assert(!test_bit(OBSOLETE_ZNODE, - &zp->flags)); - ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + ubifs_assert(!ubifs_zn_obsolete(zp)); + ubifs_assert(ubifs_zn_dirty(zp)); atomic_long_dec(&c->dirty_zn_cnt); if (zp->cnext) { @@ -2865,7 +2864,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c) struct ubifs_znode *znode = cnext; cnext = cnext->cnext; - if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + if (ubifs_zn_obsolete(znode)) kfree(znode); } while (cnext && cnext != c->cnext); } @@ -3301,7 +3300,7 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, if (!S_ISREG(inode->i_mode)) return 0; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; @@ -3337,9 +3336,10 @@ out_dump: ubifs_err("inode %lu has size %lld, but there are data 
at offset %lld " "(data key %s)", (unsigned long)inode->i_ino, size, ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); + mutex_unlock(&c->tnc_mutex); dbg_dump_inode(c, inode); dbg_dump_stack(); - err = -EINVAL; + return -EINVAL; out_unlock: mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 41920f357bbf..4c15f07a8bb2 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -22,6 +22,7 @@ /* This file implements TNC functions for committing */ +#include #include "ubifs.h" /** @@ -87,8 +88,12 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, atomic_long_dec(&c->dirty_zn_cnt); ubifs_assert(ubifs_zn_dirty(znode)); - ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_cow(znode)); + /* + * Note, unlike 'write_index()' we do not add memory barriers here + * because this function is called with @c->tnc_mutex locked. + */ __clear_bit(DIRTY_ZNODE, &znode->flags); __clear_bit(COW_ZNODE, &znode->flags); @@ -377,7 +382,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) c->gap_lebs = NULL; return err; } - if (dbg_force_in_the_gaps_enabled()) { + if (!dbg_is_chk_index(c)) { /* * Do not print scary warnings if the debugging * option which forces in-the-gaps is enabled. @@ -491,25 +496,6 @@ static int layout_in_empty_space(struct ubifs_info *c) else next_len = ubifs_idx_node_sz(c, cnext->child_cnt); - if (c->min_io_size == 1) { - buf_offs += ALIGN(len, 8); - if (next_len) { - if (buf_offs + next_len <= c->leb_size) - continue; - err = ubifs_update_one_lp(c, lnum, 0, - c->leb_size - buf_offs, 0, 0); - if (err) - return err; - lnum = -1; - continue; - } - err = ubifs_update_one_lp(c, lnum, - c->leb_size - buf_offs, 0, 0, 0); - if (err) - return err; - break; - } - /* Update buffer positions */ wlen = used + len; used += ALIGN(len, 8); @@ -658,7 +644,7 @@ static int get_znodes_to_commit(struct ubifs_info *c) } cnt += 1; while (1) { - ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_cow(znode)); __set_bit(COW_ZNODE, &znode->flags); znode->alt = 0; cnext = find_next_dirty(znode); @@ -704,7 +690,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_force_in_the_gaps()) + if (dbg_is_chk_index(c) && !(random32() & 7)) return -ENOSPC; return 0; } @@ -830,7 +816,7 @@ static int write_index(struct ubifs_info *c) struct ubifs_idx_node *idx; struct ubifs_znode *znode, *cnext; int i, lnum, offs, len, next_len, buf_len, buf_offs, used; - int avail, wlen, err, lnum_pos = 0; + int avail, wlen, err, lnum_pos = 0, blen, nxt_offs; cnext = c->enext; if (!cnext) @@ -907,7 +893,7 @@ static int write_index(struct ubifs_info *c) cnext = znode->cnext; ubifs_assert(ubifs_zn_dirty(znode)); - ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_cow(znode)); /* * It is important that other threads should see %DIRTY_ZNODE @@ -922,6 +908,28 @@ static int write_index(struct ubifs_info *c) clear_bit(COW_ZNODE, &znode->flags); smp_mb__after_clear_bit(); + /* + * We have marked the znode as clean but have not updated the + * @c->clean_zn_cnt counter. If this znode becomes dirty again + * before 'free_obsolete_znodes()' is called, then + * @c->clean_zn_cnt will be decremented before it gets + * incremented (resulting in 2 decrements for the same znode). + * This means that @c->clean_zn_cnt may become negative for a + * while. + * + * Q: why we cannot increment @c->clean_zn_cnt? 
+ * A: because we do not have the @c->tnc_mutex locked, and the + * following code would be racy and buggy: + * + * if (!ubifs_zn_obsolete(znode)) { + * atomic_long_inc(&c->clean_zn_cnt); + * atomic_long_inc(&ubifs_clean_zn_cnt); + * } + * + * Thus, we just delay the @c->clean_zn_cnt update until we + * have the mutex locked. + */ + /* Do not access znode from this point on */ /* Update buffer positions */ @@ -938,65 +946,38 @@ static int write_index(struct ubifs_info *c) else next_len = ubifs_idx_node_sz(c, cnext->child_cnt); - if (c->min_io_size == 1) { - /* - * Write the prepared index node immediately if there is - * no minimum IO size - */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, - wlen, UBI_SHORTTERM); - if (err) - return err; - buf_offs += ALIGN(wlen, 8); - if (next_len) { - used = 0; - avail = buf_len; - if (buf_offs + next_len > c->leb_size) { - err = ubifs_update_one_lp(c, lnum, - LPROPS_NC, 0, 0, LPROPS_TAKEN); - if (err) - return err; - lnum = -1; - } + nxt_offs = buf_offs + used + next_len; + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) continue; - } + else + blen = buf_len; } else { - int blen, nxt_offs = buf_offs + used + next_len; - - if (next_len && nxt_offs <= c->leb_size) { - if (avail > 0) - continue; - else - blen = buf_len; - } else { - wlen = ALIGN(wlen, 8); - blen = ALIGN(wlen, c->min_io_size); - ubifs_pad(c, c->cbuf + wlen, blen - wlen); - } - /* - * The buffer is full or there are no more znodes - * to do - */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, - blen, UBI_SHORTTERM); - if (err) - return err; - buf_offs += blen; - if (next_len) { - if (nxt_offs > c->leb_size) { - err = ubifs_update_one_lp(c, lnum, - LPROPS_NC, 0, 0, LPROPS_TAKEN); - if (err) - return err; - lnum = -1; - } - used -= blen; - if (used < 0) - used = 0; - avail = buf_len - used; - memmove(c->cbuf, c->cbuf + blen, used); - continue; + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + + /* The buffer is full or there are no more znodes to do */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen, + UBI_SHORTTERM); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, + 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; } break; } @@ -1029,7 +1010,7 @@ static void free_obsolete_znodes(struct ubifs_info *c) do { znode = cnext; cnext = znode->cnext; - if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + if (ubifs_zn_obsolete(znode)) kfree(znode); else { znode->cnext = NULL; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index f79983d6f860..702b79258e30 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -230,14 +230,14 @@ enum { * LPT cnode flag bits. 
* * DIRTY_CNODE: cnode is dirty - * COW_CNODE: cnode is being committed and must be copied before writing * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), - * so it can (and must) be freed when the commit is finished + * so it can (and must) be freed when the commit is finished + * COW_CNODE: cnode is being committed and must be copied before writing */ enum { DIRTY_CNODE = 0, - COW_CNODE = 1, - OBSOLETE_CNODE = 2, + OBSOLETE_CNODE = 1, + COW_CNODE = 2, }; /* @@ -1468,6 +1468,15 @@ extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg); +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len, int dtype); +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, + int dtype); +int ubifs_leb_unmap(struct ubifs_info *c, int lnum); +int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype); +int ubifs_is_mapped(const struct ubifs_info *c, int lnum); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, int dtype); @@ -1747,8 +1756,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int jhead); struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf); -int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); -int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf); int ubifs_rcvry_gc_commit(struct ubifs_info *c); int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, int deletion, loff_t new_size); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 284a7c89697e..75bb316529dd 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -88,8 +88,6 @@ xfs-y += xfs_alloc.o \ xfs_vnodeops.o \ xfs_rw.o -xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o - # Objects in linux/ xfs-y += $(addprefix $(XFS_LINUX)/, \ kmem.o \ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 39f4f809bb68..115ac6919533 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -264,7 +264,7 @@ xfs_set_mode(struct inode *inode, mode_t mode) iattr.ia_mode = mode; iattr.ia_ctime = current_fs_time(inode->i_sb); - error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); } return error; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 79ce38be15a1..26384fe3f26d 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -181,6 +181,7 @@ xfs_setfilesize( isize = xfs_ioend_new_eof(ioend); if (isize) { + trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); ip->i_d.di_size = isize; xfs_mark_inode_dirty(ip); } @@ -894,11 +895,6 @@ out_invalidate: * For unwritten space on the page we need to start the conversion to * regular allocated space. * For any other dirty buffer heads on the page we should flush them. - * - * If we detect that a transaction would be required to flush the page, we - * have to check the process flags first, if we are already in a transaction - * or disk I/O during allocations is off, we need to fail the writepage and - * redirty the page. 
*/ STATIC int xfs_vm_writepage( @@ -906,7 +902,6 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - int delalloc, unwritten; struct buffer_head *bh, *head; struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; @@ -938,15 +933,10 @@ xfs_vm_writepage( goto redirty; /* - * We need a transaction if there are delalloc or unwritten buffers - * on the page. - * - * If we need a transaction and the process flags say we are already - * in a transaction, or no IO is allowed then mark the page dirty - * again and leave the page as is. + * Given that we do not allow direct reclaim to call us, we should + * never be called while in a filesystem transaction. */ - xfs_count_page_state(page, &delalloc, &unwritten); - if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) + if (WARN_ON(current->flags & PF_FSTRANS)) goto redirty; /* Is this page beyond the end of the file? */ @@ -970,7 +960,7 @@ xfs_vm_writepage( offset = page_offset(page); type = IO_OVERWRITE; - if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + if (wbc->sync_mode == WB_SYNC_NONE) nonblocking = 1; do { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 5e68099db2a5..b2b411985591 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -499,16 +499,14 @@ found: spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); - if (xfs_buf_cond_lock(bp)) { - /* failed, so wait for the lock if requested. */ - if (!(flags & XBF_TRYLOCK)) { - xfs_buf_lock(bp); - XFS_STATS_INC(xb_get_locked_waited); - } else { + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) { xfs_buf_rele(bp); XFS_STATS_INC(xb_busy_locked); return NULL; } + xfs_buf_lock(bp); + XFS_STATS_INC(xb_get_locked_waited); } /* @@ -594,10 +592,8 @@ _xfs_buf_read( ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ - XBF_READ_AHEAD | _XBF_RUN_QUEUES); - bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \ - XBF_READ_AHEAD | _XBF_RUN_QUEUES); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); + bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); status = xfs_buf_iorequest(bp); if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) @@ -681,7 +677,6 @@ xfs_buf_read_uncached( return NULL; /* set up the buffer for a read IO */ - xfs_buf_lock(bp); XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); XFS_BUF_BUSY(bp); @@ -816,8 +811,6 @@ xfs_buf_get_uncached( goto fail_free_mem; } - xfs_buf_unlock(bp); - trace_xfs_buf_get_uncached(bp, _RET_IP_); return bp; @@ -896,8 +889,8 @@ xfs_buf_rele( * to push on stale inode buffers. */ int -xfs_buf_cond_lock( - xfs_buf_t *bp) +xfs_buf_trylock( + struct xfs_buf *bp) { int locked; @@ -907,15 +900,8 @@ xfs_buf_cond_lock( else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) xfs_log_force(bp->b_target->bt_mount, 0); - trace_xfs_buf_cond_lock(bp, _RET_IP_); - return locked ? 
0 : -EBUSY; -} - -int -xfs_buf_lock_value( - xfs_buf_t *bp) -{ - return bp->b_sema.count; + trace_xfs_buf_trylock(bp, _RET_IP_); + return locked; } /* @@ -929,7 +915,7 @@ xfs_buf_lock_value( */ void xfs_buf_lock( - xfs_buf_t *bp) + struct xfs_buf *bp) { trace_xfs_buf_lock(bp, _RET_IP_); @@ -950,7 +936,7 @@ xfs_buf_lock( */ void xfs_buf_unlock( - xfs_buf_t *bp) + struct xfs_buf *bp) { if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { atomic_inc(&bp->b_hold); @@ -1121,7 +1107,7 @@ xfs_bioerror_relse( XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_DONE(bp); XFS_BUF_STALE(bp); - XFS_BUF_CLR_IODONE_FUNC(bp); + bp->b_iodone = NULL; if (!(fl & XBF_ASYNC)) { /* * Mark b_error and B_ERROR _both_. @@ -1223,23 +1209,21 @@ _xfs_buf_ioapply( total_nr_pages = bp->b_page_count; map_i = 0; - if (bp->b_flags & XBF_ORDERED) { - ASSERT(!(bp->b_flags & XBF_READ)); - rw = WRITE_FLUSH_FUA; - } else if (bp->b_flags & XBF_LOG_BUFFER) { - ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); - bp->b_flags &= ~_XBF_RUN_QUEUES; - rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; - } else if (bp->b_flags & _XBF_RUN_QUEUES) { - ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); - bp->b_flags &= ~_XBF_RUN_QUEUES; - rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META; + if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_SYNCIO) + rw = WRITE_SYNC; + else + rw = WRITE; + if (bp->b_flags & XBF_FUA) + rw |= REQ_FUA; + if (bp->b_flags & XBF_FLUSH) + rw |= REQ_FLUSH; + } else if (bp->b_flags & XBF_READ_AHEAD) { + rw = READA; } else { - rw = (bp->b_flags & XBF_WRITE) ? WRITE : - (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; + rw = READ; } - next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); @@ -1694,15 +1678,14 @@ xfs_buf_delwri_split( list_for_each_entry_safe(bp, n, dwq, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); - if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { + if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) { if (!force && time_before(jiffies, bp->b_queuetime + age)) { xfs_buf_unlock(bp); break; } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| - _XBF_RUN_QUEUES); + bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q); bp->b_flags |= XBF_WRITE; list_move_tail(&bp->b_list, list); trace_xfs_buf_delwri_split(bp, _RET_IP_); @@ -1738,14 +1721,6 @@ xfs_buf_cmp( return 0; } -void -xfs_buf_delwri_sort( - xfs_buftarg_t *target, - struct list_head *list) -{ - list_sort(NULL, list, xfs_buf_cmp); -} - STATIC int xfsbufd( void *data) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 50a7d5fb3b73..6a83b46b4bcf 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -46,43 +46,46 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ #define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ -#define XBF_ORDERED (1 << 11)/* use ordered writes */ -#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ -#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ + +/* I/O hints for the BIO layer */ +#define XBF_SYNCIO (1 << 10)/* treat this 
buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_LOCK (1 << 14)/* lock requested */ -#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */ -#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ +#define XBF_LOCK (1 << 15)/* lock requested */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */ /* flags used only internally */ -#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ -#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ -#define _XBF_KMEM (1 << 20)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ + { XBF_READ_AHEAD, "READ_AHEAD" }, \ { XBF_MAPPED, "MAPPED" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_DELWRI, "DELWRI" }, \ { XBF_STALE, "STALE" }, \ - { XBF_ORDERED, "ORDERED" }, \ - { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_SYNCIO, "SYNCIO" }, \ + { XBF_FUA, "FUA" }, \ + { XBF_FLUSH, "FLUSH" }, \ { XBF_LOCK, "LOCK" }, /* should never be set */\ { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ - { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" } @@ -91,11 +94,6 @@ typedef enum { XBT_FORCE_FLUSH = 1, } xfs_buftarg_flags_t; -typedef struct xfs_bufhash { - struct list_head bh_list; - spinlock_t bh_lock; -} xfs_bufhash_t; - typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; @@ -151,7 +149,7 @@ typedef struct xfs_buf { xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; - void *b_fspriv2; + struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ unsigned long b_queuetime; /* time buffer was queued */ @@ -192,10 +190,11 @@ extern void xfs_buf_free(xfs_buf_t *); extern void xfs_buf_rele(xfs_buf_t *); /* Locking and Unlocking Buffers */ -extern int xfs_buf_cond_lock(xfs_buf_t *); -extern int xfs_buf_lock_value(xfs_buf_t *); +extern int xfs_buf_trylock(xfs_buf_t *); extern void xfs_buf_lock(xfs_buf_t *); extern void xfs_buf_unlock(xfs_buf_t *); +#define xfs_buf_islocked(bp) \ + ((bp)->b_sema.count <= 0) /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); @@ -234,8 +233,9 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) -#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ - ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) +#define XFS_BUF_ZEROFLAGS(bp) \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ + XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_STALE(bp) xfs_buf_stale(bp); @@ -267,10 +267,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) -#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) -#define 
XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) -#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) - #define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) @@ -280,14 +276,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) -#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) -#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) - -#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) -#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) -#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) -#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) #define XFS_BUF_SET_START(bp) do { } while (0) #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) @@ -313,10 +301,6 @@ xfs_buf_set_ref( #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) -#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) -#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) -#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) -#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index f4f878fc0083..75e5d322e48f 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -151,14 +151,14 @@ xfs_nfs_get_inode( * We don't use ESTALE directly down the chain to not * confuse applications using bulkstat that expect EINVAL. */ - if (error == EINVAL) + if (error == EINVAL || error == ENOENT) error = ESTALE; return ERR_PTR(-error); } if (ip->i_d.di_gen != generation) { IRELE(ip); - return ERR_PTR(-ENOENT); + return ERR_PTR(-ESTALE); } return VFS_I(ip); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 7f782af286bf..8073f61efb8e 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -944,7 +944,7 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK); } out_unlock: diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index d44d92cd12b1..501e4f630548 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -39,6 +39,7 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_vnodeops.h" +#include "xfs_inode_item.h" #include "xfs_trace.h" #include @@ -497,12 +498,442 @@ xfs_vn_getattr( return 0; } +int +xfs_setattr_nonsize( + struct xfs_inode *ip, + struct iattr *iattr, + int flags) +{ + xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; + xfs_trans_t *tp; + int error; + uid_t uid = 0, iuid = 0; + gid_t gid = 0, igid = 0; + struct xfs_dquot *udqp = NULL, *gdqp = NULL; + struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + + trace_xfs_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = -inode_change_ok(inode, iattr); + if (error) + return XFS_ERROR(error); + + ASSERT((mask & ATTR_SIZE) == 0); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. 
Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { + uint qflags = 0; + + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; + qflags |= XFS_QMOPT_UQUOTA; + } else { + uid = ip->i_d.di_uid; + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; + qflags |= XFS_QMOPT_GQUOTA; + } else { + gid = ip->i_d.di_gid; + } + + /* + * We take a reference when we initialize udqp and gdqp, + * so it is important that we never blindly double trip on + * the same variable. See xfs_create() for an example. + */ + ASSERT(udqp == NULL); + ASSERT(gdqp == NULL); + error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), + qflags, &udqp, &gdqp); + if (error) + return error; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) + goto out_dqrele; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * These IDs could have changed since we last looked at them. + * But, we're assured that if the ownership did change + * while we didn't have the inode locked, inode's dquot(s) + * would have changed also. + */ + iuid = ip->i_d.di_uid; + igid = ip->i_d.di_gid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; + + /* + * Do a quota reservation only if uid/gid is actually + * going to change. + */ + if (XFS_IS_QUOTA_RUNNING(mp) && + ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || + (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { + ASSERT(tp); + error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (error) /* out of quota */ + goto out_trans_cancel; + } + } + + xfs_trans_ijoin(tp, ip); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (iuid != uid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(mask & ATTR_UID); + ASSERT(udqp); + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } + ip->i_d.di_uid = uid; + inode->i_uid = uid; + } + if (igid != gid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(mask & ATTR_GID); + ASSERT(gdqp); + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_gid = gid; + inode->i_gid = gid; + } + } + + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; + } + + /* + * Change file access or modified times. 
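
The timestamp block below writes each value twice, once into the VFS inode for stat(2) and once into the on-disk core for the log, then sets i_update_core. The essence, with deliberately simplified types (not the real xfs_inode layout):

#include <stdbool.h>
#include <time.h>

struct vfs_inode     { struct timespec i_ctime; };
struct xfs_timestamp { long t_sec, t_nsec; };
struct inode_pair {
        struct vfs_inode     vfs;       /* what stat(2) reports */
        struct xfs_timestamp di_ctime;  /* what the journal logs */
        bool                 update_core;
};

static void set_ctime(struct inode_pair *ip, struct timespec ts)
{
        ip->vfs.i_ctime     = ts;
        ip->di_ctime.t_sec  = ts.tv_sec;
        ip->di_ctime.t_nsec = ts.tv_nsec;
        ip->update_core     = true;   /* make sure the core gets relogged */
}

int main(void)
{
        struct inode_pair ip = {0};
        struct timespec now = { .tv_sec = 1311366193 };

        set_ctime(&ip, now);
        return ip.update_core ? 0 : 1;
}
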
+ */ + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + ip->i_update_core = 1; + } + if (mask & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + ip->i_update_core = 1; + } + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + ip->i_update_core = 1; + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot1); + xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (error) + return XFS_ERROR(error); + + /* + * XXX(hch): Updating the ACL entries is not atomic vs the i_mode + * update. We could avoid this with linked transactions + * and passing down the transaction pointer all the way + * to attr_set. No previous user of the generic + * Posix ACL code seems to care about this issue either. + */ + if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { + error = -xfs_acl_chmod(inode); + if (error) + return XFS_ERROR(error); + } + + return 0; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + return error; +} + +/* + * Truncate file. Must have write permission and not be a directory. + */ +int +xfs_setattr_size( + struct xfs_inode *ip, + struct iattr *iattr, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; + struct xfs_trans *tp; + int error; + uint lock_flags; + uint commit_flags = 0; + + trace_xfs_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = -inode_change_ok(inode, iattr); + if (error) + return XFS_ERROR(error); + + ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| + ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + + lock_flags = XFS_ILOCK_EXCL; + if (!(flags & XFS_ATTR_NOLOCK)) + lock_flags |= XFS_IOLOCK_EXCL; + xfs_ilock(ip, lock_flags); + + /* + * Short circuit the truncate case for zero length files. + */ + if (iattr->ia_size == 0 && + ip->i_size == 0 && ip->i_d.di_nextents == 0) { + if (!(mask & (ATTR_CTIME|ATTR_MTIME))) + goto out_unlock; + + /* + * Use the regular setattr path to update the timestamps. + */ + xfs_iunlock(ip, lock_flags); + iattr->ia_valid &= ~ATTR_SIZE; + return xfs_setattr_nonsize(ip, iattr, 0); + } + + /* + * Make sure that the dquots are attached to the inode. + */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out_unlock; + + /* + * Now we can make the changes. Before we join the inode to the + * transaction, take care of the part of the truncation that must be + * done without the inode lock. This needs to be done before joining + * the inode to the transaction, because the inode cannot be unlocked + * once it is a part of the transaction. 
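
A compilable skeleton of the ordering rule just stated, with trivial stubs standing in for the real XFS calls (assumed names, illustration only):

static int  zero_eof(void)      { return 0; }  /* may block on I/O */
static int  trans_reserve(void) { return 0; }  /* log reservation  */
static void trans_ijoin(void)   { }            /* inode now pinned */
static int  trans_commit(void)  { return 0; }

/* Anything that can block or fault must run before trans_ijoin():
 * a joined inode stays locked until commit or cancel. */
static int setattr_size_sketch(void)
{
        int error;

        error = zero_eof();        /* pre-transaction work, EOF zeroing */
        if (error)
                return error;
        error = trans_reserve();
        if (error)
                return error;
        trans_ijoin();             /* point of no unlock */
        return trans_commit();
}

int main(void) { return setattr_size_sketch(); }
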
+ */ + if (iattr->ia_size > ip->i_size) { + /* + * Do the first part of growing a file: zero any data in the + * last block that is beyond the old EOF. We need to do this + * before the inode is joined to the transaction to modify + * i_size. + */ + error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); + if (error) + goto out_unlock; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + lock_flags &= ~XFS_ILOCK_EXCL; + + /* + * We are going to log the inode size change in this transaction so + * any previous writes that are beyond the on disk EOF and the new + * EOF that have not been written out need to be written here. If we + * do not write the data out, we expose ourselves to the null files + * problem. + * + * Only flush from the on disk size to the smaller of the in memory + * file size or the new size as that's the range we really care about + * here and prevents waiting for other data not within the range we + * care about here. + */ + if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, + XBF_ASYNC, FI_NONE); + if (error) + goto out_unlock; + } + + /* + * Wait for all I/O to complete. + */ + xfs_ioend_wait(ip); + + error = -block_truncate_page(inode->i_mapping, iattr->ia_size, + xfs_get_blocks); + if (error) + goto out_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) + goto out_trans_cancel; + + truncate_setsize(inode, iattr->ia_size); + + commit_flags = XFS_TRANS_RELEASE_LOG_RES; + lock_flags |= XFS_ILOCK_EXCL; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip); + + /* + * Only change the c/mtime if we are changing the size or we are + * explicitly asked to change it. This handles the semantic difference + * between truncate() and ftruncate() as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + mask |= ATTR_CTIME | ATTR_MTIME; + } + + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { + error = xfs_itruncate_data(&tp, ip, iattr->ia_size); + if (error) + goto out_trans_abort; + + /* + * Truncated "down", so we're removing references to old data + * here - if we delay flushing for a long time, we expose + * ourselves unduly to the notorious NULL files problem. So, + * we mark this inode and flush it when the file is closed, + * and do not wait the usual (long) time for writeout. 
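
Earlier in this function the pre-truncate flush is limited to the only range that can leak stale data after a crash, the classic NULL-files exposure. That range decision, extracted into a runnable check with stand-in sizes:

#include <stdio.h>

/* Mirrors the condition in xfs_setattr_size(): flush only when the
 * in-memory size has run ahead of the on-disk size and the new size
 * exposes that gap. */
static void flush_decision(long long di_size, long long i_size,
                           long long new_size)
{
        if (i_size != di_size && new_size > di_size)
                printf("flush [%lld, %lld)\n", di_size, new_size);
        else
                printf("no flush needed\n");
}

int main(void)
{
        flush_decision(4096, 8192, 16384);  /* growing past dirty tail */
        flush_decision(4096, 4096, 0);      /* plain truncate down     */
        return 0;
}
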
+ */ + xfs_iflags_set(ip, XFS_ITRUNCATED); + } + + if (mask & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + ip->i_update_core = 1; + } + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + ip->i_update_core = 1; + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +out_unlock: + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return error; + +out_trans_abort: + commit_flags |= XFS_TRANS_ABORT; +out_trans_cancel: + xfs_trans_cancel(tp, commit_flags); + goto out_unlock; +} + STATIC int xfs_vn_setattr( struct dentry *dentry, struct iattr *iattr) { - return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); + if (iattr->ia_valid & ATTR_SIZE) + return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0); + return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0); } #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 8633521b3b2e..d42f814e4d35 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -33,7 +33,6 @@ #endif #include -#include #include #include @@ -88,6 +87,12 @@ #include #include +#ifdef __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif + /* * Feature macros (disable/enable) */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a1a881e68a9a..25fd2cd6c8b0 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -33,7 +33,6 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -1412,37 +1411,35 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; - xfs_inode_shrinker_register(mp); error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; + + error = xfs_syncd_init(mp); + if (error) + goto out_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; - goto fail_unmount; + goto out_syncd_stop; } if (is_bad_inode(root)) { error = EINVAL; - goto fail_vnrele; + goto out_syncd_stop; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { error = ENOMEM; - goto fail_vnrele; + goto out_iput; } return 0; - out_syncd_stop: - xfs_inode_shrinker_unregister(mp); - xfs_syncd_stop(mp); out_filestream_unmount: + xfs_inode_shrinker_unregister(mp); xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1456,17 +1453,12 @@ xfs_fs_fill_super( out: return -error; - fail_vnrele: - if (sb->s_root) { - dput(sb->s_root); - sb->s_root = NULL; - } else { - iput(root); - } - - fail_unmount: - xfs_inode_shrinker_unregister(mp); + out_iput: + iput(root); + out_syncd_stop: xfs_syncd_stop(mp); + out_unmount: + xfs_inode_shrinker_unregister(mp); /* * Blow away any referenced inode in the filestreams cache. 
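
The error-path reshuffle in xfs_fs_fill_super() above restores the usual kernel unwind discipline: one label per setup step, torn down in strictly reverse order. A skeleton of the pattern with stand-in step names (not the actual XFS calls):

#include <stdio.h>

static int do_step(const char *name, int fail)
{
        printf("%s\n", name);
        return fail;
}

static int fill_super_sketch(int fail_at)
{
        if (do_step("mountfs",    fail_at == 1)) goto out;
        if (do_step("syncd_init", fail_at == 2)) goto out_unmount;
        if (do_step("grab_root",  fail_at == 3)) goto out_syncd_stop;
        return 0;

out_syncd_stop:
        printf("syncd_stop\n");
out_unmount:
        printf("unmountfs\n");
out:
        return -1;
}

int main(void)
{
        return fill_super_sketch(0);
}
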
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 8ecad5ff9f9b..5cc158e52d4c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -359,14 +359,12 @@ xfs_quiesce_data( { int error, error2 = 0; - /* push non-blocking */ - xfs_sync_data(mp, 0); xfs_qm_sync(mp, SYNC_TRYLOCK); - - /* push and block till complete */ - xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); + /* force out the newly dirtied log buffers */ + xfs_log_force(mp, XFS_LOG_SYNC); + /* write superblock and hoover up shutdown errors */ error = xfs_sync_fsdata(mp); @@ -436,7 +434,7 @@ xfs_quiesce_attr( WARN_ON(atomic_read(&mp->m_active_trans) != 0); /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp, 1); + error = xfs_log_sbcount(mp); if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index e3a6ad27415f..e914fd621746 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -21,14 +21,6 @@ struct xfs_mount; struct xfs_perag; -typedef struct xfs_sync_work { - struct list_head w_list; - struct xfs_mount *w_mount; - void *w_data; /* syncer routine argument */ - void (*w_syncer)(struct xfs_mount *, void *); - struct completion *w_completion; -} xfs_sync_work_t; - #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index d48b7a579ae1..fda0708ef2ea 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -293,7 +293,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->buffer_length = bp->b_buffer_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); - __entry->lockval = xfs_buf_lock_value(bp); + __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), @@ -323,7 +323,7 @@ DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_bdwrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); -DEFINE_BUF_EVENT(xfs_buf_cond_lock); +DEFINE_BUF_EVENT(xfs_buf_trylock); DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); @@ -366,7 +366,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); - __entry->lockval = xfs_buf_lock_value(bp); + __entry->lockval = bp->b_sema.count; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " @@ -409,7 +409,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->buffer_length = bp->b_buffer_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); - __entry->lockval = xfs_buf_lock_value(bp); + __entry->lockval = bp->b_sema.count; __entry->error = error; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; @@ -454,7 +454,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); - __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); + __entry->buf_lockval = bip->bli_buf->b_sema.count; __entry->li_desc = bip->bli_item.li_desc; __entry->li_flags = bip->bli_item.li_flags; ), @@ -998,7 +998,8 @@ 
DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(loff_t, size) + __field(loff_t, isize) + __field(loff_t, disize) __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) @@ -1006,16 +1007,18 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->size = ip->i_d.di_size; + __entry->isize = ip->i_size; + __entry->disize = ip->i_d.di_size; __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " "offset 0x%llx count %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->size, + __entry->isize, + __entry->disize, __entry->new_size, __entry->offset, __entry->count) @@ -1028,40 +1031,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); - - -TRACE_EVENT(xfs_itruncate_start, - TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag, - xfs_off_t toss_start, xfs_off_t toss_finish), - TP_ARGS(ip, new_size, flag, toss_start, toss_finish), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fsize_t, size) - __field(xfs_fsize_t, new_size) - __field(xfs_off_t, toss_start) - __field(xfs_off_t, toss_finish) - __field(int, flag) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->size = ip->i_d.di_size; - __entry->new_size = new_size; - __entry->toss_start = toss_start; - __entry->toss_finish = toss_finish; - __entry->flag = flag; - ), - TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx " - "toss start 0x%llx toss finish 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS), - __entry->size, - __entry->new_size, - __entry->toss_start, - __entry->toss_finish) -); +DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -1089,8 +1059,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) -DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start); -DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end); +DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); TRACE_EVENT(xfs_pagecache_inval, TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 6fa214603819..837f31158d43 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -220,7 +220,7 @@ xfs_qm_adjust_dqtimers( { ASSERT(d->d_id); -#ifdef QUOTADEBUG +#ifdef DEBUG if (d->d_blk_hardlimit) ASSERT(be64_to_cpu(d->d_blk_softlimit) <= be64_to_cpu(d->d_blk_hardlimit)); @@ -231,6 +231,7 @@ xfs_qm_adjust_dqtimers( ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= be64_to_cpu(d->d_rtb_hardlimit)); #endif + if (!d->d_btimer) { if ((d->d_blk_softlimit && (be64_to_cpu(d->d_bcount) >= @@ -318,7 +319,7 @@ xfs_qm_init_dquot_blk( ASSERT(tp); ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); @@ -534,7 +535,7 @@ xfs_qm_dqtobp( } ASSERT(XFS_BUF_ISBUSY(bp)); - 
ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); /* * calculate the location of the dquot inside the buffer. @@ -622,7 +623,7 @@ xfs_qm_dqread( * brelse it because we have the changes incore. */ ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); return (error); @@ -1423,45 +1424,6 @@ xfs_qm_dqpurge( } -#ifdef QUOTADEBUG -void -xfs_qm_dqprint(xfs_dquot_t *dqp) -{ - struct xfs_mount *mp = dqp->q_mount; - - xfs_debug(mp, "-----------KERNEL DQUOT----------------"); - xfs_debug(mp, "---- dquotID = %d", - (int)be32_to_cpu(dqp->q_core.d_id)); - xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); - xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount); - xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno); - xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset); - xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_blk_hardlimit), - (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - xfs_debug(mp, "---- blkslimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_blk_softlimit), - (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); - xfs_debug(mp, "---- inohlimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_ino_hardlimit), - (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); - xfs_debug(mp, "---- inoslimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_ino_softlimit), - (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); - xfs_debug(mp, "---- bcount = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_bcount), - (int)be64_to_cpu(dqp->q_core.d_bcount)); - xfs_debug(mp, "---- icount = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_icount), - (int)be64_to_cpu(dqp->q_core.d_icount)); - xfs_debug(mp, "---- btimer = %d", - (int)be32_to_cpu(dqp->q_core.d_btimer)); - xfs_debug(mp, "---- itimer = %d", - (int)be32_to_cpu(dqp->q_core.d_itimer)); - xfs_debug(mp, "---------------------------"); -} -#endif - /* * Give the buffer a little push if it is incore and * wait on the flush lock. diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 5da3a23b820d..34b7e945dbfa 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -116,12 +116,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ (XFS_IS_OQUOTA_ON((d)->q_mount)))) -#ifdef QUOTADEBUG -extern void xfs_qm_dqprint(xfs_dquot_t *); -#else -#define xfs_qm_dqprint(a) -#endif - extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(xfs_dquot_t *, uint); extern int xfs_qm_dqpurge(xfs_dquot_t *); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index b94dace4e785..46e54ad9a2dc 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -67,32 +67,6 @@ static struct shrinker xfs_qm_shaker = { .seeks = DEFAULT_SEEKS, }; -#ifdef DEBUG -extern struct mutex qcheck_lock; -#endif - -#ifdef QUOTADEBUG -static void -xfs_qm_dquot_list_print( - struct xfs_mount *mp) -{ - xfs_dquot_t *dqp; - int i = 0; - - list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { - xfs_debug(mp, " %d. \"%d (%s)\" " - "bcnt = %lld, icnt = %lld, refs = %d", - i++, be32_to_cpu(dqp->q_core.d_id), - DQFLAGTO_TYPESTR(dqp), - (long long)be64_to_cpu(dqp->q_core.d_bcount), - (long long)be64_to_cpu(dqp->q_core.d_icount), - dqp->q_nrefs); - } -} -#else -static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { } -#endif - /* * Initialize the XQM structure. * Note that there is not one quota manager per file system. 
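
The assertions above trade the opaque XFS_BUF_VALUSEMA(bp) <= 0 for xfs_buf_islocked(bp), the same semaphore-count test (see the new macro in xfs_buf.h earlier in this patch) behind a self-describing name. The idea in miniature:

#include <assert.h>

struct buf { int sema_count; };   /* 1 = free, <= 0 = held */

/* Equivalent in spirit to the new xfs_buf_islocked() macro. */
static inline int buf_islocked(const struct buf *bp)
{
        return bp->sema_count <= 0;
}

int main(void)
{
        struct buf locked   = { .sema_count = 0 };
        struct buf unlocked = { .sema_count = 1 };

        assert(buf_islocked(&locked));
        assert(!buf_islocked(&unlocked));
        return 0;
}
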
@@ -165,9 +139,6 @@ xfs_Gqm_init(void) atomic_set(&xqm->qm_totaldquots, 0); xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; xqm->qm_nrefs = 0; -#ifdef DEBUG - mutex_init(&qcheck_lock); -#endif return xqm; out_free_udqhash: @@ -204,9 +175,6 @@ xfs_qm_destroy( mutex_lock(&xqm->qm_dqfrlist_lock); list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { xfs_dqlock(dqp); -#ifdef QUOTADEBUG - xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp); -#endif list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; xfs_dqunlock(dqp); @@ -214,9 +182,6 @@ xfs_qm_destroy( } mutex_unlock(&xqm->qm_dqfrlist_lock); mutex_destroy(&xqm->qm_dqfrlist_lock); -#ifdef DEBUG - mutex_destroy(&qcheck_lock); -#endif kmem_free(xqm); } @@ -409,11 +374,6 @@ xfs_qm_mount_quotas( xfs_warn(mp, "Failed to initialize disk quotas."); return; } - -#ifdef QUOTADEBUG - if (XFS_IS_QUOTA_ON(mp)) - xfs_qm_internalqcheck(mp); -#endif } /* @@ -866,8 +826,8 @@ xfs_qm_dqattach_locked( } done: -#ifdef QUOTADEBUG - if (! error) { +#ifdef DEBUG + if (!error) { if (XFS_IS_UQUOTA_ON(mp)) ASSERT(ip->i_udquot); if (XFS_IS_OQUOTA_ON(mp)) @@ -1733,8 +1693,6 @@ xfs_qm_quotacheck( mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); mp->m_qflags |= flags; - xfs_qm_dquot_list_print(mp); - error_return: if (error) { xfs_warn(mp, @@ -2096,9 +2054,6 @@ xfs_qm_write_sb_changes( xfs_trans_t *tp; int error; -#ifdef QUOTADEBUG - xfs_notice(mp, "Writing superblock quota changes"); -#endif tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); if ((error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 567b29b9f1b3..43b9abe1052c 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -163,10 +163,4 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); -#ifdef DEBUG -extern int xfs_qm_internalqcheck(xfs_mount_t *); -#else -#define xfs_qm_internalqcheck(mp) (0) -#endif - #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 2dadb15d5ca9..609246f42e6c 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -263,7 +263,7 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip); - error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1); + error = xfs_itruncate_data(&tp, ip, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); @@ -622,7 +622,6 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); out_unlock: @@ -657,7 +656,6 @@ xfs_qm_scall_getquota( xfs_qm_dqput(dqp); return XFS_ERROR(ENOENT); } - /* xfs_qm_dqprint(dqp); */ /* * Convert the disk dquot to the exportable format */ @@ -906,354 +904,3 @@ xfs_qm_dqrele_all_inodes( ASSERT(mp->m_quotainfo); xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); } - -/*------------------------------------------------------------------------*/ -#ifdef DEBUG -/* - * This contains all the test functions for XFS disk quotas. - * Currently it does a quota accounting check. ie. it walks through - * all inodes in the file system, calculating the dquot accounting fields, - * and prints out any inconsistencies. 
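
Before the self-check is deleted wholesale below, its core step is worth keeping in mind: rebuild per-id usage from a full inode walk, then diff it against the live dquots. A compilable miniature of that comparison (made-up numbers, simplified structs):

#include <stdio.h>

struct usage { unsigned long long bcount, icount; };

static int qcheck_compare(unsigned id, struct usage scanned, struct usage live)
{
        int fails = 0;

        if (scanned.icount != live.icount) {
                printf("id %u: icount %llu != %llu\n",
                       id, live.icount, scanned.icount);
                fails++;
        }
        if (scanned.bcount != live.bcount) {
                printf("id %u: bcount %llu != %llu\n",
                       id, live.bcount, scanned.bcount);
                fails++;
        }
        return fails;
}

int main(void)
{
        struct usage scanned = { .bcount = 100, .icount = 3 };
        struct usage live    = { .bcount = 100, .icount = 3 };

        return qcheck_compare(501, scanned, live);
}
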
- */ -xfs_dqhash_t *qmtest_udqtab; -xfs_dqhash_t *qmtest_gdqtab; -int qmtest_hashmask; -int qmtest_nfails; -struct mutex qcheck_lock; - -#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ - (__psunsigned_t)(id)) & \ - (qmtest_hashmask - 1)) - -#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \ - (qmtest_udqtab + \ - DQTEST_HASHVAL(mp, id)) : \ - (qmtest_gdqtab + \ - DQTEST_HASHVAL(mp, id))) - -#define DQTEST_LIST_PRINT(l, NXT, title) \ -{ \ - xfs_dqtest_t *dqp; int i = 0;\ - xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \ - for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ - dqp = (xfs_dqtest_t *)dqp->NXT) { \ - xfs_debug(dqp->q_mount, \ - " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ - ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ - dqp->d_bcount, dqp->d_icount); } \ -} - -typedef struct dqtest { - uint dq_flags; /* various flags (XFS_DQ_*) */ - struct list_head q_hashlist; - xfs_dqhash_t *q_hash; /* the hashchain header */ - xfs_mount_t *q_mount; /* filesystem this relates to */ - xfs_dqid_t d_id; /* user id or group id */ - xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */ - xfs_qcnt_t d_icount; /* # inodes owned by the user */ -} xfs_dqtest_t; - -STATIC void -xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) -{ - list_add(&dqp->q_hashlist, &h->qh_list); - h->qh_version++; - h->qh_nelems++; -} -STATIC void -xfs_qm_dqtest_print( - struct xfs_mount *mp, - struct dqtest *d) -{ - xfs_debug(mp, "-----------DQTEST DQUOT----------------"); - xfs_debug(mp, "---- dquot ID = %d", d->d_id); - xfs_debug(mp, "---- fs = 0x%p", d->q_mount); - xfs_debug(mp, "---- bcount = %Lu (0x%x)", - d->d_bcount, (int)d->d_bcount); - xfs_debug(mp, "---- icount = %Lu (0x%x)", - d->d_icount, (int)d->d_icount); - xfs_debug(mp, "---------------------------"); -} - -STATIC void -xfs_qm_dqtest_failed( - xfs_dqtest_t *d, - xfs_dquot_t *dqp, - char *reason, - xfs_qcnt_t a, - xfs_qcnt_t b, - int error) -{ - qmtest_nfails++; - if (error) - xfs_debug(dqp->q_mount, - "quotacheck failed id=%d, err=%d\nreason: %s", - d->d_id, error, reason); - else - xfs_debug(dqp->q_mount, - "quotacheck failed id=%d (%s) [%d != %d]", - d->d_id, reason, (int)a, (int)b); - xfs_qm_dqtest_print(dqp->q_mount, d); - if (dqp) - xfs_qm_dqprint(dqp); -} - -STATIC int -xfs_dqtest_cmp2( - xfs_dqtest_t *d, - xfs_dquot_t *dqp) -{ - int err = 0; - if (be64_to_cpu(dqp->q_core.d_icount) != d->d_icount) { - xfs_qm_dqtest_failed(d, dqp, "icount mismatch", - be64_to_cpu(dqp->q_core.d_icount), - d->d_icount, 0); - err++; - } - if (be64_to_cpu(dqp->q_core.d_bcount) != d->d_bcount) { - xfs_qm_dqtest_failed(d, dqp, "bcount mismatch", - be64_to_cpu(dqp->q_core.d_bcount), - d->d_bcount, 0); - err++; - } - if (dqp->q_core.d_blk_softlimit && - be64_to_cpu(dqp->q_core.d_bcount) >= - be64_to_cpu(dqp->q_core.d_blk_softlimit)) { - if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { - xfs_debug(dqp->q_mount, - "%d [%s] BLK TIMER NOT STARTED", - d->d_id, DQFLAGTO_TYPESTR(d)); - err++; - } - } - if (dqp->q_core.d_ino_softlimit && - be64_to_cpu(dqp->q_core.d_icount) >= - be64_to_cpu(dqp->q_core.d_ino_softlimit)) { - if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { - xfs_debug(dqp->q_mount, - "%d [%s] INO TIMER NOT STARTED", - d->d_id, DQFLAGTO_TYPESTR(d)); - err++; - } - } -#ifdef QUOTADEBUG - if (!err) { - xfs_debug(dqp->q_mount, "%d [%s] qchecked", - d->d_id, DQFLAGTO_TYPESTR(d)); - } -#endif - return (err); -} - -STATIC void -xfs_dqtest_cmp( - xfs_dqtest_t *d) -{ - xfs_dquot_t *dqp; - int error; - - /* xfs_qm_dqtest_print(d); */ - if ((error 
= xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0, - &dqp))) { - xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error); - return; - } - xfs_dqtest_cmp2(d, dqp); - xfs_qm_dqput(dqp); -} - -STATIC int -xfs_qm_internalqcheck_dqget( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - xfs_dqtest_t **O_dq) -{ - xfs_dqtest_t *d; - xfs_dqhash_t *h; - - h = DQTEST_HASH(mp, id, type); - list_for_each_entry(d, &h->qh_list, q_hashlist) { - if (d->d_id == id && mp == d->q_mount) { - *O_dq = d; - return (0); - } - } - d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP); - d->dq_flags = type; - d->d_id = id; - d->q_mount = mp; - d->q_hash = h; - INIT_LIST_HEAD(&d->q_hashlist); - xfs_qm_hashinsert(h, d); - *O_dq = d; - return (0); -} - -STATIC void -xfs_qm_internalqcheck_get_dquots( - xfs_mount_t *mp, - xfs_dqid_t uid, - xfs_dqid_t projid, - xfs_dqid_t gid, - xfs_dqtest_t **ud, - xfs_dqtest_t **gd) -{ - if (XFS_IS_UQUOTA_ON(mp)) - xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud); - if (XFS_IS_GQUOTA_ON(mp)) - xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd); - else if (XFS_IS_PQUOTA_ON(mp)) - xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd); -} - - -STATIC void -xfs_qm_internalqcheck_dqadjust( - xfs_inode_t *ip, - xfs_dqtest_t *d) -{ - d->d_icount++; - d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks; -} - -STATIC int -xfs_qm_internalqcheck_adjust( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* not used */ - int ubsize, /* not used */ - int *ubused, /* not used */ - int *res) /* bulkstat result code */ -{ - xfs_inode_t *ip; - xfs_dqtest_t *ud, *gd; - uint lock_flags; - boolean_t ipreleased; - int error; - - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { - *res = BULKSTAT_RV_NOTHING; - xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n", - __func__, (unsigned long long) ino, - (unsigned long long) mp->m_sb.sb_uquotino, - (unsigned long long) mp->m_sb.sb_gquotino); - return XFS_ERROR(EINVAL); - } - ipreleased = B_FALSE; - again: - lock_flags = XFS_ILOCK_SHARED; - if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) { - *res = BULKSTAT_RV_NOTHING; - return (error); - } - - /* - * This inode can have blocks after eof which can get released - * when we send it to inactive. Since we don't check the dquot - * until the after all our calculations are done, we must get rid - * of those now. - */ - if (! ipreleased) { - xfs_iunlock(ip, lock_flags); - IRELE(ip); - ipreleased = B_TRUE; - goto again; - } - xfs_qm_internalqcheck_get_dquots(mp, - (xfs_dqid_t) ip->i_d.di_uid, - (xfs_dqid_t) xfs_get_projid(ip), - (xfs_dqid_t) ip->i_d.di_gid, - &ud, &gd); - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(ud); - xfs_qm_internalqcheck_dqadjust(ip, ud); - } - if (XFS_IS_OQUOTA_ON(mp)) { - ASSERT(gd); - xfs_qm_internalqcheck_dqadjust(ip, gd); - } - xfs_iunlock(ip, lock_flags); - IRELE(ip); - *res = BULKSTAT_RV_DIDONE; - return (0); -} - - -/* PRIVATE, debugging */ -int -xfs_qm_internalqcheck( - xfs_mount_t *mp) -{ - xfs_ino_t lastino; - int done, count; - int i; - int error; - - lastino = 0; - qmtest_hashmask = 32; - count = 5; - done = 0; - qmtest_nfails = 0; - - if (! XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(ESRCH); - - xfs_log_force(mp, XFS_LOG_SYNC); - XFS_bflush(mp->m_ddev_targp); - xfs_log_force(mp, XFS_LOG_SYNC); - XFS_bflush(mp->m_ddev_targp); - - mutex_lock(&qcheck_lock); - /* There should be absolutely no quota activity while this - is going on. 
*/ - qmtest_udqtab = kmem_zalloc(qmtest_hashmask * - sizeof(xfs_dqhash_t), KM_SLEEP); - qmtest_gdqtab = kmem_zalloc(qmtest_hashmask * - sizeof(xfs_dqhash_t), KM_SLEEP); - do { - /* - * Iterate thru all the inodes in the file system, - * adjusting the corresponding dquot counters - */ - error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_internalqcheck_adjust, - 0, NULL, &done); - if (error) { - xfs_debug(mp, "Bulkstat returned error 0x%x", error); - break; - } - } while (!done); - - xfs_debug(mp, "Checking results against system dquots"); - for (i = 0; i < qmtest_hashmask; i++) { - xfs_dqtest_t *d, *n; - xfs_dqhash_t *h; - - h = &qmtest_udqtab[i]; - list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { - xfs_dqtest_cmp(d); - kmem_free(d); - } - h = &qmtest_gdqtab[i]; - list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { - xfs_dqtest_cmp(d); - kmem_free(d); - } - } - - if (qmtest_nfails) { - xfs_debug(mp, "******** quotacheck failed ********"); - xfs_debug(mp, "failures = %d", qmtest_nfails); - } else { - xfs_debug(mp, "******** quotacheck successful! ********"); - } - kmem_free(qmtest_udqtab); - kmem_free(qmtest_gdqtab); - mutex_unlock(&qcheck_lock); - return (qmtest_nfails); -} - -#endif /* DEBUG */ diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 2a3648731331..4d00ee67792d 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -59,7 +59,7 @@ xfs_trans_dqjoin( xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); /* - * Initialize i_transp so we can later determine if this dquot is + * Initialize d_transp so we can later determine if this dquot is * associated with this transaction. */ dqp->q_transp = tp; @@ -387,18 +387,18 @@ xfs_trans_apply_dquot_deltas( qtrx->qt_delbcnt_delta; totalrtbdelta = qtrx->qt_rtbcount_delta + qtrx->qt_delrtb_delta; -#ifdef QUOTADEBUG +#ifdef DEBUG if (totalbdelta < 0) ASSERT(be64_to_cpu(d->d_bcount) >= - (xfs_qcnt_t) -totalbdelta); + -totalbdelta); if (totalrtbdelta < 0) ASSERT(be64_to_cpu(d->d_rtbcount) >= - (xfs_qcnt_t) -totalrtbdelta); + -totalrtbdelta); if (qtrx->qt_icount_delta < 0) ASSERT(be64_to_cpu(d->d_icount) >= - (xfs_qcnt_t) -qtrx->qt_icount_delta); + -qtrx->qt_icount_delta); #endif if (totalbdelta) be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); @@ -642,11 +642,6 @@ xfs_trans_dqresv( ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { -#ifdef QUOTADEBUG - xfs_debug(mp, - "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?", - nblks, *resbcountp, hardlimit); -#endif if (nblks > 0) { /* * dquot is locked already. 
See if we'd go over the diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 5ad8ad3a1dcd..53ec3ea9a625 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -22,7 +22,6 @@ #define STATIC #define DEBUG 1 #define XFS_BUF_LOCK_TRACKING 1 -/* #define QUOTADEBUG 1 */ #endif #include diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 95862bbff56b..1e00b3ef6274 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -570,9 +570,7 @@ xfs_alloc_ag_vextent_exact( xfs_agblock_t tbno; /* start block of trimmed extent */ xfs_extlen_t tlen; /* length of trimmed extent */ xfs_agblock_t tend; /* end block of trimmed extent */ - xfs_agblock_t end; /* end of allocated extent */ int i; /* success/failure of operation */ - xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); @@ -625,18 +623,16 @@ xfs_alloc_ag_vextent_exact( * * Fix the length according to mod and prod if given. */ - end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen); - args->len = end - args->agbno; + args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) + - args->agbno; xfs_alloc_fix_len(args); if (!xfs_alloc_fix_minleft(args)) goto not_found; - rlen = args->len; - ASSERT(args->agbno + rlen <= tend); - end = args->agbno + rlen; + ASSERT(args->agbno + args->len <= tend); /* - * We are allocating agbno for rlen [agbno .. end] + * We are allocating agbno for args->len * Allocate/initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, @@ -2127,7 +2123,7 @@ xfs_read_agf( * Validate the magic number of the agf block. */ agf_ok = - be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && + agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 2b3518826a69..ffb3386e45c1 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -31,7 +31,6 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_btree_trace.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -311,72 +310,6 @@ xfs_allocbt_recs_inorder( } #endif /* DEBUG */ -#ifdef XFS_BTREE_TRACE -ktrace_t *xfs_allocbt_trace_buf; - -STATIC void -xfs_allocbt_trace_enter( - struct xfs_btree_cur *cur, - const char *func, - char *s, - int type, - int line, - __psunsigned_t a0, - __psunsigned_t a1, - __psunsigned_t a2, - __psunsigned_t a3, - __psunsigned_t a4, - __psunsigned_t a5, - __psunsigned_t a6, - __psunsigned_t a7, - __psunsigned_t a8, - __psunsigned_t a9, - __psunsigned_t a10) -{ - ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type, - (void *)func, (void *)s, NULL, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); -} - -STATIC void -xfs_allocbt_trace_cursor( - struct xfs_btree_cur *cur, - __uint32_t *s0, - __uint64_t *l0, - __uint64_t *l1) -{ - *s0 = cur->bc_private.a.agno; - *l0 = cur->bc_rec.a.ar_startblock; - *l1 = cur->bc_rec.a.ar_blockcount; -} - -STATIC void -xfs_allocbt_trace_key( - struct xfs_btree_cur *cur, - union xfs_btree_key *key, - __uint64_t *l0, - __uint64_t *l1) -{ - *l0 = be32_to_cpu(key->alloc.ar_startblock); - *l1 = be32_to_cpu(key->alloc.ar_blockcount); -} - -STATIC void -xfs_allocbt_trace_record( - struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, - __uint64_t *l0, 
- __uint64_t *l1, - __uint64_t *l2) -{ - *l0 = be32_to_cpu(rec->alloc.ar_startblock); - *l1 = be32_to_cpu(rec->alloc.ar_blockcount); - *l2 = 0; -} -#endif /* XFS_BTREE_TRACE */ - static const struct xfs_btree_ops xfs_allocbt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -393,18 +326,10 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, - #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, #endif - -#ifdef XFS_BTREE_TRACE - .trace_enter = xfs_allocbt_trace_enter, - .trace_cursor = xfs_allocbt_trace_cursor, - .trace_key = xfs_allocbt_trace_key, - .trace_record = xfs_allocbt_trace_record, -#endif }; /* @@ -427,13 +352,16 @@ xfs_allocbt_init_cursor( cur->bc_tp = tp; cur->bc_mp = mp; - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]); cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_allocbt_ops; - if (btnum == XFS_BTNUM_CNT) + + if (btnum == XFS_BTNUM_CNT) { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; + } else { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + } cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h deleted file mode 100644 index 0902249354a0..000000000000 --- a/fs/xfs/xfs_arch.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_ARCH_H__ -#define __XFS_ARCH_H__ - -#ifndef XFS_BIG_INUMS -# error XFS_BIG_INUMS must be defined true or false -#endif - -#ifdef __KERNEL__ - -#include - -#ifdef __BIG_ENDIAN -#define XFS_NATIVE_HOST 1 -#else -#undef XFS_NATIVE_HOST -#endif - -#else /* __KERNEL__ */ - -#if __BYTE_ORDER == __BIG_ENDIAN -#define XFS_NATIVE_HOST 1 -#else -#undef XFS_NATIVE_HOST -#endif - -#ifdef XFS_NATIVE_HOST -#define cpu_to_be16(val) ((__force __be16)(__u16)(val)) -#define cpu_to_be32(val) ((__force __be32)(__u32)(val)) -#define cpu_to_be64(val) ((__force __be64)(__u64)(val)) -#define be16_to_cpu(val) ((__force __u16)(__be16)(val)) -#define be32_to_cpu(val) ((__force __u32)(__be32)(val)) -#define be64_to_cpu(val) ((__force __u64)(__be64)(val)) -#else -#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val))) -#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val))) -#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val))) -#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val))) -#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val))) -#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val))) -#endif - -static inline void be16_add_cpu(__be16 *a, __s16 b) -{ - *a = cpu_to_be16(be16_to_cpu(*a) + b); -} - -static inline void be32_add_cpu(__be32 *a, __s32 b) -{ - *a = cpu_to_be32(be32_to_cpu(*a) + b); -} - -static inline void be64_add_cpu(__be64 *a, __s64 b) -{ - *a = cpu_to_be64(be64_to_cpu(*a) + b); -} - -#endif /* __KERNEL__ */ - -/* - * get and set integers from potentially unaligned locations - */ - -#define INT_GET_UNALIGNED_16_BE(pointer) \ - ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1]))) -#define INT_SET_UNALIGNED_16_BE(pointer,value) \ - { \ - ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \ - ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ - } - -/* - * In directories inode numbers are stored as unaligned arrays of unsigned - * 8bit integers on disk. 
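
The layouts described next are exercised by the XFS_GET_DIR_INO4/XFS_PUT_DIR_INO4 macros at the bottom of this header. A runnable round-trip of the 4-byte case, equivalent in effect to those macros:

#include <assert.h>
#include <stdint.h>

static void put_ino4(uint32_t ino, uint8_t di[4])
{
        di[0] = ino >> 24;
        di[1] = ino >> 16;
        di[2] = ino >> 8;
        di[3] = ino;
}

static uint32_t get_ino4(const uint8_t di[4])
{
        return ((uint32_t)di[0] << 24) | ((uint32_t)di[1] << 16) |
               ((uint32_t)di[2] << 8)  |  (uint32_t)di[3];
}

int main(void)
{
        uint8_t di[4];

        put_ino4(0x00c0ffee, di);
        assert(get_ino4(di) == 0x00c0ffee);  /* same on any host endianness */
        return 0;
}
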
- * - * For v1 directories or v2 directories that contain inode numbers that - * do not fit into 32bit the array has eight members, but the first member - * is always zero: - * - * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7| - * - * For v2 directories that only contain entries with inode numbers that fit - * into 32bits a four-member array is used: - * - * |24-31|16-23| 8-15| 0- 7| - */ - -#define XFS_GET_DIR_INO4(di) \ - (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) - -#define XFS_PUT_DIR_INO4(from, di) \ -do { \ - (di).i[0] = (((from) & 0xff000000ULL) >> 24); \ - (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \ - (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \ - (di).i[3] = ((from) & 0x000000ffULL); \ -} while (0) - -#define XFS_DI_HI(di) \ - (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) -#define XFS_DI_LO(di) \ - (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) - -#define XFS_GET_DIR_INO8(di) \ - (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ - ((xfs_ino_t)XFS_DI_HI(di) << 32)) - -#define XFS_PUT_DIR_INO8(from, di) \ -do { \ - (di).i[0] = 0; \ - (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \ - (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \ - (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \ - (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \ - (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \ - (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \ - (di).i[7] = ((from) & 0x00000000000000ffULL); \ -} while (0) - -#endif /* __XFS_ARCH_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 01d2072fb6d4..cbae424fe1ba 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -822,17 +822,21 @@ xfs_attr_inactive(xfs_inode_t *dp) error = xfs_attr_root_inactive(&trans, dp); if (error) goto out; + /* - * signal synchronous inactive transactions unless this - * is a synchronous mount filesystem in which case we - * know that we're here because we've been called out of - * xfs_inactive which means that the last reference is gone - * and the unlink transaction has already hit the disk so - * async inactive transactions are safe. + * Signal synchronous inactive transactions unless this is a + * synchronous mount filesystem in which case we know that we're here + * because we've been called out of xfs_inactive which means that the + * last reference is gone and the unlink transaction has already hit + * the disk so async inactive transactions are safe. */ - if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK, - (!(mp->m_flags & XFS_MOUNT_WSYNC) - ? 
1 : 0)))) + if (!(mp->m_flags & XFS_MOUNT_WSYNC)) { + if (dp->i_d.di_anextents > 0) + xfs_trans_set_sync(trans); + } + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) goto out; /* @@ -1199,7 +1203,7 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) return XFS_ERROR(error); ASSERT(bp != NULL); leaf = bp->data; - if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) { + if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); xfs_da_brelse(NULL, bp); @@ -1606,9 +1610,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) XFS_ATTR_FORK); if (error) goto out; - ASSERT(be16_to_cpu(((xfs_attr_leafblock_t *) - bp->data)->hdr.info.magic) - == XFS_ATTR_LEAF_MAGIC); + ASSERT((((xfs_attr_leafblock_t *)bp->data)->hdr.info.magic) == + cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); @@ -1873,11 +1876,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) return(XFS_ERROR(EFSCORRUPTED)); } node = bp->data; - if (be16_to_cpu(node->hdr.info.magic) - == XFS_ATTR_LEAF_MAGIC) + if (node->hdr.info.magic == + cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) break; - if (unlikely(be16_to_cpu(node->hdr.info.magic) - != XFS_DA_NODE_MAGIC)) { + if (unlikely(node->hdr.info.magic != + cpu_to_be16(XFS_DA_NODE_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", XFS_ERRLEVEL_LOW, context->dp->i_mount, @@ -1912,8 +1915,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->data; - if (unlikely(be16_to_cpu(leaf->hdr.info.magic) - != XFS_ATTR_LEAF_MAGIC)) { + if (unlikely(leaf->hdr.info.magic != + cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 71e90dc2aeb1..8fad9602542b 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -731,7 +731,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) int bytes, i; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); @@ -777,7 +777,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) ASSERT(bp != NULL); memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); leaf = (xfs_attr_leafblock_t *)tmpbuffer; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); /* @@ -872,7 +872,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) goto out; node = bp1->data; leaf = bp2->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* both on-disk, don't endian-flip twice */ node->btree[0].hashval = leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; @@ -997,7 +997,7 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) int tablesize, entsize, sum, tmp, i; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(leaf->hdr.count))); hdr = &leaf->hdr; @@ -1070,7 +1070,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, 
xfs_da_args_t *args, int mapindex) int tmp, i; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); @@ -1256,8 +1256,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->data; leaf2 = blk2->bp->data; - ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); args = state->args; /* @@ -1533,7 +1533,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) */ blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); bytes = sizeof(xfs_attr_leaf_hdr_t) + @@ -1596,7 +1596,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); @@ -1650,7 +1650,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_mount_t *mp; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; mp = args->trans->t_mountp; ASSERT((be16_to_cpu(hdr->count) > 0) @@ -1813,8 +1813,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); drop_leaf = drop_blk->bp->data; save_leaf = save_blk->bp->data; - ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); drop_hdr = &drop_leaf->hdr; save_hdr = &save_leaf->hdr; @@ -1915,7 +1915,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_dahash_t hashval; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); @@ -2019,7 +2019,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_attr_leaf_name_remote_t *name_rmt; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); @@ -2087,8 +2087,8 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Set up environment. 
*/ - ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr_s = &leaf_s->hdr; hdr_d = &leaf_d->hdr; ASSERT((be16_to_cpu(hdr_s->count) > 0) && @@ -2222,8 +2222,8 @@ xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) leaf1 = leaf1_bp->data; leaf2 = leaf2_bp->data; - ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC) && - (be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC)); + ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && + (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); if ((be16_to_cpu(leaf1->hdr.count) > 0) && (be16_to_cpu(leaf2->hdr.count) > 0) && ((be32_to_cpu(leaf2->entries[0].hashval) < @@ -2246,7 +2246,7 @@ xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) xfs_attr_leafblock_t *leaf; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); if (!leaf->hdr.count) @@ -2265,7 +2265,7 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) xfs_attr_leaf_name_remote_t *name_rmt; int size; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr_leaf_name_local(leaf, index); size = xfs_attr_leaf_entsize_local(name_loc->namelen, @@ -2451,7 +2451,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) ASSERT(bp != NULL); leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2515,7 +2515,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) ASSERT(bp != NULL); leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2585,13 +2585,13 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) } leaf1 = bp1->data; - ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; leaf2 = bp2->data; - ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); entry2 = &leaf2->entries[ args->index2 ]; @@ -2689,9 +2689,9 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * This is a depth-first traversal! 
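 * Node blocks recurse one level deeper via xfs_attr_node_inactive(); * leaf blocks are invalidated in place via xfs_attr_leaf_inactive().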
*/ info = bp->data; - if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { + if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, bp, 1); - } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { + } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { error = xfs_attr_leaf_inactive(trans, dp, bp); } else { error = XFS_ERROR(EIO); @@ -2739,7 +2739,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, } node = bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ count = be16_to_cpu(node->hdr.count); if (!count) { @@ -2773,10 +2773,10 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, * Invalidate the subtree, however we have to. */ info = child_bp->data; - if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { + if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, child_bp, level+1); - } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { + } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { error = xfs_attr_leaf_inactive(trans, dp, child_bp); } else { @@ -2836,7 +2836,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) int error, count, size, tmp, i; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* * Count the number of "remote" value extents. diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e546a33214c9..c51a3f903633 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -29,15 +29,11 @@ #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_mount.h" #include "xfs_itable.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" #include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_alloc.h" @@ -94,6 +90,7 @@ xfs_bmap_add_attrfork_local( */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( + struct xfs_trans *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ @@ -439,6 +436,7 @@ xfs_bmap_add_attrfork_local( */ STATIC int /* error */ xfs_bmap_add_extent( + struct xfs_trans *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ @@ -524,7 +522,7 @@ xfs_bmap_add_extent( if (cur) ASSERT(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(ip, + error = xfs_bmap_add_extent_delay_real(tp, ip, idx, &cur, new, &da_new, first, flist, &logflags); } else { @@ -561,7 +559,7 @@ xfs_bmap_add_extent( int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first, + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, da_old > 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) @@ -604,6 +602,7 @@ done: */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( + struct xfs_trans *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ 
xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ @@ -901,7 +900,7 @@ xfs_bmap_add_extent_delay_real( } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; @@ -984,7 +983,7 @@ xfs_bmap_add_extent_delay_real( } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; @@ -1052,7 +1051,7 @@ xfs_bmap_add_extent_delay_real( } if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; @@ -2871,8 +2870,8 @@ xfs_bmap_del_extent( len = del->br_blockcount; do_div(bno, mp->m_sb.sb_rextsize); do_div(len, mp->m_sb.sb_rextsize); - if ((error = xfs_rtfree_extent(ip->i_transp, bno, - (xfs_extlen_t)len))) + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) goto done; do_fx = 0; nblks = len * mp->m_sb.sb_rextsize; @@ -4080,7 +4079,7 @@ xfs_bmap_sanity_check( { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC || + if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) || be16_to_cpu(block->bb_level) != level || be16_to_cpu(block->bb_numrecs) == 0 || be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) @@ -4662,7 +4661,7 @@ xfs_bmapi( if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) got.br_state = XFS_EXT_UNWRITTEN; } - error = xfs_bmap_add_extent(ip, &lastx, &cur, &got, + error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got, firstblock, flist, &tmp_logflags, whichfork); logflags |= tmp_logflags; @@ -4763,7 +4762,7 @@ xfs_bmapi( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, mval, + error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval, firstblock, flist, &tmp_logflags, whichfork); logflags |= tmp_logflags; @@ -5117,7 +5116,7 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, + error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, firstblock, flist, &logflags, XFS_DATA_FORK); if (error) @@ -5175,18 +5174,18 @@ xfs_bunmapi( } prev.br_state = XFS_EXT_UNWRITTEN; lastx--; - error = xfs_bmap_add_extent(ip, &lastx, &cur, - &prev, firstblock, flist, &logflags, - XFS_DATA_FORK); + error = xfs_bmap_add_extent(tp, ip, &lastx, + &cur, &prev, firstblock, flist, + &logflags, XFS_DATA_FORK); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, &lastx, &cur, - &del, firstblock, flist, &logflags, - XFS_DATA_FORK); + error = xfs_bmap_add_extent(tp, ip, &lastx, + &cur, &del, firstblock, flist, + &logflags, XFS_DATA_FORK); if (error) goto error0; goto nodelete; diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 87d3c10b6954..e2f5d59cbeaf 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -33,7 +33,6 @@ #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" -#include "xfs_btree_trace.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -425,10 +424,10 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); - ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO); - ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO); - ASSERT(be16_to_cpu(rblock->bb_level) > 0); + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); + ASSERT(rblock->bb_level != 0); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); @@ -732,95 +731,6 @@ xfs_bmbt_recs_inorder( } #endif /* DEBUG */ -#ifdef XFS_BTREE_TRACE -ktrace_t *xfs_bmbt_trace_buf; - -STATIC void -xfs_bmbt_trace_enter( - struct xfs_btree_cur *cur, - const char *func, - char *s, - int type, - int line, - __psunsigned_t a0, - __psunsigned_t a1, - __psunsigned_t a2, - __psunsigned_t a3, - __psunsigned_t a4, - __psunsigned_t a5, - __psunsigned_t a6, - __psunsigned_t a7, - __psunsigned_t a8, - __psunsigned_t a9, - __psunsigned_t a10) -{ - struct xfs_inode *ip = cur->bc_private.b.ip; - int whichfork = cur->bc_private.b.whichfork; - - ktrace_enter(xfs_bmbt_trace_buf, - (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), - (void *)func, (void *)s, (void *)ip, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); -} - -STATIC void -xfs_bmbt_trace_cursor( - struct xfs_btree_cur *cur, - __uint32_t *s0, - __uint64_t *l0, - __uint64_t *l1) -{ - struct xfs_bmbt_rec_host r; - - xfs_bmbt_set_all(&r, &cur->bc_rec.b); - - *s0 = (cur->bc_nlevels << 24) | - (cur->bc_private.b.flags << 16) | - cur->bc_private.b.allocated; - *l0 = r.l0; - *l1 = r.l1; -} - -STATIC void -xfs_bmbt_trace_key( - struct xfs_btree_cur *cur, - union xfs_btree_key *key, - __uint64_t *l0, - __uint64_t *l1) -{ - *l0 = be64_to_cpu(key->bmbt.br_startoff); - *l1 = 0; -} - -/* Endian 
flipping versions of the bmbt extraction functions */ -STATIC void -xfs_bmbt_disk_get_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), - get_unaligned_be64(&r->l1), s); -} - -STATIC void -xfs_bmbt_trace_record( - struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, - __uint64_t *l0, - __uint64_t *l1, - __uint64_t *l2) -{ - struct xfs_bmbt_irec irec; - - xfs_bmbt_disk_get_all(&rec->bmbt, &irec); - *l0 = irec.br_startoff; - *l1 = irec.br_startblock; - *l2 = irec.br_blockcount; -} -#endif /* XFS_BTREE_TRACE */ - static const struct xfs_btree_ops xfs_bmbt_ops = { .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), @@ -837,18 +747,10 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, - #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, #endif - -#ifdef XFS_BTREE_TRACE - .trace_enter = xfs_bmbt_trace_enter, - .trace_cursor = xfs_bmbt_trace_cursor, - .trace_key = xfs_bmbt_trace_key, - .trace_record = xfs_bmbt_trace_record, -#endif }; /* diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 2f9e97c128a0..cabf4b5604aa 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -32,7 +32,6 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_btree.h" -#include "xfs_btree_trace.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -66,11 +65,11 @@ xfs_btree_check_lblock( be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && block->bb_u.l.bb_leftsib && - (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO || + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) && block->bb_u.l.bb_rightsib && - (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO || + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))); if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, @@ -105,10 +104,10 @@ xfs_btree_check_sblock( be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && - (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK || + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && block->bb_u.s.bb_leftsib && - (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK || + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && block->bb_u.s.bb_rightsib; if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, @@ -511,9 +510,9 @@ xfs_btree_islastblock( block = xfs_btree_get_block(cur, level, &bp); xfs_btree_check_block(cur, block, level, bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO; + return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO); else - return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK; + return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } /* @@ -777,14 +776,14 @@ xfs_btree_setbuf( b = XFS_BUF_TO_BLOCK(bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) + if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)) cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; - if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO) + if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)) cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; } else { - if 
(be32_to_cpu(b->bb_u.s.bb_leftsib) == NULLAGBLOCK) + if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; - if (be32_to_cpu(b->bb_u.s.bb_rightsib) == NULLAGBLOCK) + if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; } } @@ -795,9 +794,9 @@ xfs_btree_ptr_is_null( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return be64_to_cpu(ptr->l) == NULLDFSBNO; + return ptr->l == cpu_to_be64(NULLDFSBNO); else - return be32_to_cpu(ptr->s) == NULLAGBLOCK; + return ptr->s == cpu_to_be32(NULLAGBLOCK); } STATIC void @@ -923,12 +922,12 @@ xfs_btree_ptr_to_daddr( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO); + ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); } else { ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); - ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK); + ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, be32_to_cpu(ptr->s)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 82fafc66bd1f..8d05a6a46ce3 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -199,25 +199,6 @@ struct xfs_btree_ops { union xfs_btree_rec *r1, union xfs_btree_rec *r2); #endif - - /* btree tracing */ -#ifdef XFS_BTREE_TRACE - void (*trace_enter)(struct xfs_btree_cur *, const char *, - char *, int, int, __psunsigned_t, - __psunsigned_t, __psunsigned_t, - __psunsigned_t, __psunsigned_t, - __psunsigned_t, __psunsigned_t, - __psunsigned_t, __psunsigned_t, - __psunsigned_t, __psunsigned_t); - void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *, - __uint64_t *, __uint64_t *); - void (*trace_key)(struct xfs_btree_cur *, - union xfs_btree_key *, __uint64_t *, - __uint64_t *); - void (*trace_record)(struct xfs_btree_cur *, - union xfs_btree_rec *, __uint64_t *, - __uint64_t *, __uint64_t *); -#endif }; /* @@ -452,4 +433,23 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) +/* + * Trace hooks. Currently not implemented as they need to be ported + * over to the generic tracing functionality, which is some effort. + * + * i,j = integer (32 bit) + * b = btree block buffer (xfs_buf_t) + * p = btree ptr + * r = btree record + * k = btree key + */ +#define XFS_BTREE_TRACE_ARGBI(c, b, i) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) +#define XFS_BTREE_TRACE_ARGI(c, i) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) +#define XFS_BTREE_TRACE_ARGR(c, r) +#define XFS_BTREE_TRACE_CURSOR(c, t) + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c deleted file mode 100644 index 44ff942a0fda..000000000000 --- a/fs/xfs/xfs_btree_trace.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (c) 2008 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_types.h" -#include "xfs_inum.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_btree_trace.h" - -STATIC void -xfs_btree_trace_ptr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr ptr, - __psunsigned_t *high, - __psunsigned_t *low) -{ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - __u64 val = be64_to_cpu(ptr.l); - *high = val >> 32; - *low = (int)val; - } else { - *high = 0; - *low = be32_to_cpu(ptr.s); - } -} - -/* - * Add a trace buffer entry for arguments, for a buffer & 1 integer arg. - */ -void -xfs_btree_trace_argbi( - const char *func, - struct xfs_btree_cur *cur, - struct xfs_buf *b, - int i, - int line) -{ - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI, - line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0, - 0, 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for a buffer & 2 integer args. - */ -void -xfs_btree_trace_argbii( - const char *func, - struct xfs_btree_cur *cur, - struct xfs_buf *b, - int i0, - int i1, - int line) -{ - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII, - line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0, - 0, 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for 3 block-length args - * and an integer arg. - */ -void -xfs_btree_trace_argfffi( - const char *func, - struct xfs_btree_cur *cur, - xfs_dfiloff_t o, - xfs_dfsbno_t b, - xfs_dfilblks_t i, - int j, - int line) -{ - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI, - line, - o >> 32, (int)o, - b >> 32, (int)b, - i >> 32, (int)i, - (int)j, 0, 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for one integer arg. - */ -void -xfs_btree_trace_argi( - const char *func, - struct xfs_btree_cur *cur, - int i, - int line) -{ - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI, - line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, fsblock, key. - */ -void -xfs_btree_trace_argipk( - const char *func, - struct xfs_btree_cur *cur, - int i, - union xfs_btree_ptr ptr, - union xfs_btree_key *key, - int line) -{ - __psunsigned_t high, low; - __uint64_t l0, l1; - - xfs_btree_trace_ptr(cur, ptr, &high, &low); - cur->bc_ops->trace_key(cur, key, &l0, &l1); - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK, - line, i, high, low, - l0 >> 32, (int)l0, - l1 >> 32, (int)l1, - 0, 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, fsblock, rec. - */ -void -xfs_btree_trace_argipr( - const char *func, - struct xfs_btree_cur *cur, - int i, - union xfs_btree_ptr ptr, - union xfs_btree_rec *rec, - int line) -{ - __psunsigned_t high, low; - __uint64_t l0, l1, l2; - - xfs_btree_trace_ptr(cur, ptr, &high, &low); - cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR, - line, i, - high, low, - l0 >> 32, (int)l0, - l1 >> 32, (int)l1, - l2 >> 32, (int)l2, - 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, key. 
- */ -void -xfs_btree_trace_argik( - const char *func, - struct xfs_btree_cur *cur, - int i, - union xfs_btree_key *key, - int line) -{ - __uint64_t l0, l1; - - cur->bc_ops->trace_key(cur, key, &l0, &l1); - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK, - line, i, - l0 >> 32, (int)l0, - l1 >> 32, (int)l1, - 0, 0, 0, 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for record. - */ -void -xfs_btree_trace_argr( - const char *func, - struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, - int line) -{ - __uint64_t l0, l1, l2; - - cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); - cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR, - line, - l0 >> 32, (int)l0, - l1 >> 32, (int)l1, - l2 >> 32, (int)l2, - 0, 0, 0, 0, 0); -} - -/* - * Add a trace buffer entry for the cursor/operation. - */ -void -xfs_btree_trace_cursor( - const char *func, - struct xfs_btree_cur *cur, - int type, - int line) -{ - __uint32_t s0; - __uint64_t l0, l1; - char *s; - - switch (type) { - case XBT_ARGS: - s = "args"; - break; - case XBT_ENTRY: - s = "entry"; - break; - case XBT_ERROR: - s = "error"; - break; - case XBT_EXIT: - s = "exit"; - break; - default: - s = "unknown"; - break; - } - - cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1); - cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line, - s0, - l0 >> 32, (int)l0, - l1 >> 32, (int)l1, - (__psunsigned_t)cur->bc_bufs[0], - (__psunsigned_t)cur->bc_bufs[1], - (__psunsigned_t)cur->bc_bufs[2], - (__psunsigned_t)cur->bc_bufs[3], - (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], - (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]); -} diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h deleted file mode 100644 index 2d8a309873ea..000000000000 --- a/fs/xfs/xfs_btree_trace.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2008 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_BTREE_TRACE_H__ -#define __XFS_BTREE_TRACE_H__ - -struct xfs_btree_cur; -struct xfs_buf; - - -/* - * Trace hooks. - * i,j = integer (32 bit) - * b = btree block buffer (xfs_buf_t) - * p = btree ptr - * r = btree record - * k = btree key - */ - -#ifdef XFS_BTREE_TRACE - -/* - * Trace buffer entry types. - */ -#define XFS_BTREE_KTRACE_ARGBI 1 -#define XFS_BTREE_KTRACE_ARGBII 2 -#define XFS_BTREE_KTRACE_ARGFFFI 3 -#define XFS_BTREE_KTRACE_ARGI 4 -#define XFS_BTREE_KTRACE_ARGIPK 5 -#define XFS_BTREE_KTRACE_ARGIPR 6 -#define XFS_BTREE_KTRACE_ARGIK 7 -#define XFS_BTREE_KTRACE_ARGR 8 -#define XFS_BTREE_KTRACE_CUR 9 - -/* - * Sub-types for cursor traces. 
- */ -#define XBT_ARGS 0 -#define XBT_ENTRY 1 -#define XBT_ERROR 2 -#define XBT_EXIT 3 - -void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *, - struct xfs_buf *, int, int); -void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *, - struct xfs_buf *, int, int, int); -void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int); -void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int, - union xfs_btree_ptr, union xfs_btree_key *, int); -void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int, - union xfs_btree_ptr, union xfs_btree_rec *, int); -void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int, - union xfs_btree_key *, int); -void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *, - union xfs_btree_rec *, int); -void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int); - -#define XFS_BTREE_TRACE_ARGBI(c, b, i) \ - xfs_btree_trace_argbi(__func__, c, b, i, __LINE__) -#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \ - xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__) -#define XFS_BTREE_TRACE_ARGI(c, i) \ - xfs_btree_trace_argi(__func__, c, i, __LINE__) -#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \ - xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__) -#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \ - xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__) -#define XFS_BTREE_TRACE_ARGIK(c, i, k) \ - xfs_btree_trace_argik(__func__, c, i, k, __LINE__) -#define XFS_BTREE_TRACE_ARGR(c, r) \ - xfs_btree_trace_argr(__func__, c, r, __LINE__) -#define XFS_BTREE_TRACE_CURSOR(c, t) \ - xfs_btree_trace_cursor(__func__, c, t, __LINE__) -#else -#define XFS_BTREE_TRACE_ARGBI(c, b, i) -#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) -#define XFS_BTREE_TRACE_ARGI(c, i) -#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) -#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) -#define XFS_BTREE_TRACE_ARGIK(c, i, k) -#define XFS_BTREE_TRACE_ARGR(c, r) -#define XFS_BTREE_TRACE_CURSOR(c, t) -#endif /* XFS_BTREE_TRACE */ - -#endif /* __XFS_BTREE_TRACE_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7b7e005e3dcc..88492916c3dc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -90,13 +90,11 @@ xfs_buf_item_flush_log_debug( uint first, uint last) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; uint nbytes; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { + if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF)) return; - } ASSERT(bip->bli_logged != NULL); nbytes = last - first + 1; @@ -408,7 +406,7 @@ xfs_buf_item_unpin( int stale = bip->bli_flags & XFS_BLI_STALE; int freed; - ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); + ASSERT(bp->b_fspriv == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_buf_item_unpin(bip); @@ -420,7 +418,7 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); @@ -443,7 +441,7 @@ xfs_buf_item_unpin( * Since the transaction no longer refers to the buffer, * the buffer should no longer refer to the transaction. 
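 * In the new field names used below, bp->b_transp carries the transaction * back pointer, while bp->b_fspriv heads the list of attached log items.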
*/ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + bp->b_transp = NULL; } /* @@ -454,13 +452,13 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); xfs_buf_item_relse(bp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); } @@ -483,7 +481,7 @@ xfs_buf_item_trylock( if (XFS_BUF_ISPINNED(bp)) return XFS_ITEM_PINNED; - if (!XFS_BUF_CPSEMA(bp)) + if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; /* take a reference to the buffer. */ @@ -525,7 +523,7 @@ xfs_buf_item_unlock( uint hold; /* Clear the buffer's association with this transaction. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + bp->b_transp = NULL; /* * If this is a transaction abort, don't return early. Instead, allow @@ -684,7 +682,7 @@ xfs_buf_item_init( xfs_buf_t *bp, xfs_mount_t *mp) { - xfs_log_item_t *lip; + xfs_log_item_t *lip = bp->b_fspriv; xfs_buf_log_item_t *bip; int chunks; int map_size; @@ -696,12 +694,8 @@ xfs_buf_item_init( * nothing to do here so return. */ ASSERT(bp->b_target->bt_mount == mp); - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - if (lip->li_type == XFS_LI_BUF) { - return; - } - } + if (lip != NULL && lip->li_type == XFS_LI_BUF) + return; /* * chunks is the number of XFS_BLF_CHUNK size pieces @@ -740,11 +734,9 @@ xfs_buf_item_init( * Put the buf item into the list of items attached to the * buffer at the front. */ - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - bip->bli_item.li_bio_list = - XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - } - XFS_BUF_SET_FSPRIVATE(bp, bip); + if (bp->b_fspriv) + bip->bli_item.li_bio_list = bp->b_fspriv; + bp->b_fspriv = bip; } @@ -876,12 +868,11 @@ xfs_buf_item_relse( trace_xfs_buf_item_relse(bp, _RET_IP_); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); - if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && - (XFS_BUF_IODONE_FUNC(bp) != NULL)) { - XFS_BUF_CLR_IODONE_FUNC(bp); - } + bip = bp->b_fspriv; + bp->b_fspriv = bip->bli_item.li_bio_list; + if (bp->b_fspriv == NULL) + bp->b_iodone = NULL; + xfs_buf_rele(bp); xfs_buf_item_free(bip); } @@ -905,20 +896,20 @@ xfs_buf_attach_iodone( xfs_log_item_t *head_lip; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + head_lip = bp->b_fspriv; + if (head_lip) { lip->li_bio_list = head_lip->li_bio_list; head_lip->li_bio_list = lip; } else { - XFS_BUF_SET_FSPRIVATE(bp, lip); + bp->b_fspriv = lip; } - ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || - (XFS_BUF_IODONE_FUNC(bp) == NULL)); - XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + bp->b_iodone = xfs_buf_iodone_callbacks; } /* @@ -939,8 +930,8 @@ xfs_buf_do_callbacks( { struct xfs_log_item *lip; - while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { - XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); + while ((lip = bp->b_fspriv) != NULL) { + bp->b_fspriv = lip->li_bio_list; ASSERT(lip->li_cb != NULL); /* * Clear the next pointer so we don't have any @@ -1007,7 +998,7 @@ xfs_buf_iodone_callbacks( XFS_BUF_DONE(bp); XFS_BUF_SET_START(bp); } - 
ASSERT(XFS_BUF_IODONE_FUNC(bp)); + ASSERT(bp->b_iodone != NULL); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); xfs_buf_relse(bp); return; @@ -1026,8 +1017,8 @@ xfs_buf_iodone_callbacks( do_callbacks: xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 6102ac6d1dff..2925726529f8 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -24,11 +24,12 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" +#include "xfs_dir2.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -36,10 +37,6 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -89,7 +86,7 @@ STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, */ STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); -STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra); +STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps); STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); @@ -321,11 +318,11 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(bp != NULL); node = bp->data; oldroot = blk1->bp->data; - if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC) { + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - (char *)oldroot); } else { - ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); leaf = (xfs_dir2_leaf_t *)oldroot; size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - (char *)leaf); @@ -352,7 +349,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, node->hdr.count = cpu_to_be16(2); #ifdef DEBUG - if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC) { + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { ASSERT(blk1->blkno >= mp->m_dirleafblk && blk1->blkno < mp->m_dirfreeblk); ASSERT(blk2->blkno >= mp->m_dirleafblk && @@ -384,7 +381,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, int useextra; node = oldblk->bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); /* * With V2 dirs the extra block is data or freespace. 
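The conversion running through these hunks swaps the constant instead of the on-disk field: be16_to_cpu(x) == MAGIC byteswaps x on every check on little-endian hosts, while x == cpu_to_be16(MAGIC) lets the compiler fold the swap of the constant at build time. A minimal standalone sketch of the idea, with an illustrative magic value and a hand-rolled swap standing in for cpu_to_be16() on a little-endian host:

#include <stdint.h>
#include <stdio.h>

#define EX_NODE_MAGIC	0xfebeU		/* illustrative, not a real XFS magic */

static inline uint16_t ex_swab16(uint16_t x)	/* stand-in for cpu_to_be16() */
{
	return (uint16_t)((x << 8) | (x >> 8));
}

int main(void)
{
	uint16_t disk_magic = ex_swab16(EX_NODE_MAGIC);	/* field as stored on disk */

	/*
	 * The old style swapped the variable: ex_swab16(disk_magic) == EX_NODE_MAGIC.
	 * The new style swaps the constant; ex_swab16(EX_NODE_MAGIC) folds to a
	 * literal, so the run-time comparison is just two raw 16-bit values.
	 */
	if (disk_magic == ex_swab16(EX_NODE_MAGIC))
		printf("magic matches\n");
	return 0;
}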
@@ -483,8 +480,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, node1 = node2; node2 = tmpnode; } - ASSERT(be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC); - ASSERT(be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; if (count == 0) return; @@ -578,7 +575,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, int tmp; node = oldblk->bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) @@ -714,7 +711,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); oldroot = root_blk->bp->data; - ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT(!oldroot->hdr.info.forw); ASSERT(!oldroot->hdr.info.back); @@ -737,10 +734,10 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) ASSERT(bp != NULL); blkinfo = bp->data; if (be16_to_cpu(oldroot->hdr.level) == 1) { - ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC || - be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(blkinfo->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + blkinfo->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); } else { - ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC); + ASSERT(blkinfo->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); } ASSERT(!blkinfo->forw); ASSERT(!blkinfo->back); @@ -776,7 +773,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC); + ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; count = be16_to_cpu(node->hdr.count); if (count > (state->node_ents >> 1)) { @@ -836,7 +833,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) count -= state->node_ents >> 2; count -= be16_to_cpu(node->hdr.count); node = bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); count -= be16_to_cpu(node->hdr.count); xfs_da_brelse(state->args->trans, bp); if (count >= 0) @@ -911,7 +908,7 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) } for (blk--, level--; level >= 0; blk--, level--) { node = blk->bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); btree = &node->btree[ blk->index ]; if (be32_to_cpu(btree->hashval) == lasthash) break; @@ -979,8 +976,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, drop_node = drop_blk->bp->data; save_node = save_blk->bp->data; - ASSERT(be16_to_cpu(drop_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - ASSERT(be16_to_cpu(save_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); tp = state->args->trans; /* @@ -1278,8 +1275,8 @@ 
xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) node1 = node1_bp->data; node2 = node2_bp->data; - ASSERT((be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC) && - (be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC)); + ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && + node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || @@ -1299,7 +1296,7 @@ xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) xfs_da_intnode_t *node; node = bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (count) *count = be16_to_cpu(node->hdr.count); if (!node->hdr.count) @@ -1412,7 +1409,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, for (blk = &path->blk[level]; level >= 0; blk--, level--) { ASSERT(blk->bp != NULL); node = blk->bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { blk->index++; blkno = be32_to_cpu(node->btree[blk->index].before); @@ -1451,9 +1448,9 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, return(error); ASSERT(blk->bp != NULL); info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC || - be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC || - be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); blk->magic = be16_to_cpu(info->magic); if (blk->magic == XFS_DA_NODE_MAGIC) { node = (xfs_da_intnode_t *)info; @@ -1546,79 +1543,62 @@ const struct xfs_nameops xfs_default_nameops = { .compname = xfs_da_compname }; -/* - * Add a block to the btree ahead of the file. - * Return the new block number to the caller. - */ int -xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) +xfs_da_grow_inode_int( + struct xfs_da_args *args, + xfs_fileoff_t *bno, + int count) { - xfs_fileoff_t bno, b; - xfs_bmbt_irec_t map; - xfs_bmbt_irec_t *mapp; - xfs_inode_t *dp; - int nmap, error, w, count, c, got, i, mapi; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_drfsbno_t nblks; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + int w = args->whichfork; + xfs_drfsbno_t nblks = dp->i_d.di_nblocks; + struct xfs_bmbt_irec map, *mapp; + int nmap, error, got, i, mapi; - dp = args->dp; - mp = dp->i_mount; - w = args->whichfork; - tp = args->trans; - nblks = dp->i_d.di_nblocks; - - /* - * For new directories adjust the file offset and block count. - */ - if (w == XFS_DATA_FORK) { - bno = mp->m_dirleafblk; - count = mp->m_dirblkfsbs; - } else { - bno = 0; - count = 1; - } /* * Find a spot in the file space to put the new block. */ - if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) + error = xfs_bmap_first_unused(tp, dp, count, bno, w); + if (error) return error; - if (w == XFS_DATA_FORK) - ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk); + /* * Try mapping it in one filesystem block. 
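 * XFS_BMAPI_CONTIG requests a single contiguous extent; if that fails * and count > 1, the fallback below retries without it and maps piecewise.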
*/ nmap = 1; ASSERT(args->firstblock != NULL); - if ((error = xfs_bmapi(tp, dp, bno, count, + error = xfs_bmapi(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist))) { + args->flist); + if (error) return error; - } + ASSERT(nmap <= 1); if (nmap == 1) { mapp = &map; mapi = 1; - } - /* - * If we didn't get it and the block might work if fragmented, - * try without the CONTIG flag. Loop until we get it all. - */ - else if (nmap == 0 && count > 1) { + } else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; + int c; + + /* + * If we didn't get it and the block might work if fragmented, + * try without the CONTIG flag. Loop until we get it all. + */ mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); - for (b = bno, mapi = 0; b < bno + count; ) { + for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = MIN(XFS_BMAP_MAX_NMAP, count); - c = (int)(bno + count - b); - if ((error = xfs_bmapi(tp, dp, b, c, + c = (int)(*bno + count - b); + error = xfs_bmapi(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist))) { - kmem_free(mapp); - return error; - } + &mapp[mapi], &nmap, args->flist); + if (error) + goto out_free_map; if (nmap < 1) break; mapi += nmap; @@ -1629,24 +1609,53 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) mapi = 0; mapp = NULL; } + /* * Count the blocks we got, make sure it matches the total. */ for (i = 0, got = 0; i < mapi; i++) got += mapp[i].br_blockcount; - if (got != count || mapp[0].br_startoff != bno || + if (got != count || mapp[0].br_startoff != *bno || mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != - bno + count) { - if (mapp != &map) - kmem_free(mapp); - return XFS_ERROR(ENOSPC); + *bno + count) { + error = XFS_ERROR(ENOSPC); + goto out_free_map; } - if (mapp != &map) - kmem_free(mapp); + /* account for newly allocated blocks in reserved blocks total */ args->total -= dp->i_d.di_nblocks - nblks; - *new_blkno = (xfs_dablk_t)bno; - return 0; + +out_free_map: + if (mapp != &map) + kmem_free(mapp); + return error; +} + +/* + * Add a block to the btree ahead of the file. + * Return the new block number to the caller. + */ +int +xfs_da_grow_inode( + struct xfs_da_args *args, + xfs_dablk_t *new_blkno) +{ + xfs_fileoff_t bno; + int count; + int error; + + if (args->whichfork == XFS_DATA_FORK) { + bno = args->dp->i_mount->m_dirleafblk; + count = args->dp->i_mount->m_dirblkfsbs; + } else { + bno = 0; + count = 1; + } + + error = xfs_da_grow_inode_int(args, &bno, count); + if (!error) + *new_blkno = (xfs_dablk_t)bno; + return error; } /* @@ -1704,12 +1713,12 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, /* * Get values from the moved block.
*/ - if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) { + if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; dead_level = 0; dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); } else { - ASSERT(be16_to_cpu(dead_info->magic) == XFS_DA_NODE_MAGIC); + ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); dead_node = (xfs_da_intnode_t *)dead_info; dead_level = be16_to_cpu(dead_node->hdr.level); dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); @@ -1768,8 +1777,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) goto done; par_node = par_buf->data; - if (unlikely( - be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC || + if (unlikely(par_node->hdr.info.magic != + cpu_to_be16(XFS_DA_NODE_MAGIC) || (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); @@ -1820,7 +1829,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, par_node = par_buf->data; if (unlikely( be16_to_cpu(par_node->hdr.level) != level || - be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC)) { + par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); @@ -1930,8 +1939,7 @@ xfs_da_do_buf( xfs_daddr_t *mappedbnop, xfs_dabuf_t **bpp, int whichfork, - int caller, - inst_t *ra) + int caller) { xfs_buf_t *bp = NULL; xfs_buf_t **bplist; @@ -2070,25 +2078,22 @@ xfs_da_do_buf( * Build a dabuf structure. */ if (bplist) { - rbp = xfs_da_buf_make(nbplist, bplist, ra); + rbp = xfs_da_buf_make(nbplist, bplist); } else if (bp) - rbp = xfs_da_buf_make(1, &bp, ra); + rbp = xfs_da_buf_make(1, &bp); else rbp = NULL; /* * For read_buf, check the magic number. 
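 * magic carries the 16-bit magics (da node, attr leaf, dir2 leaf variants); * magic1 carries the 32-bit dir2 block/data magics; the dir2 free-space * magic is checked separately as a 32-bit field.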
*/ if (caller == 1) { - xfs_dir2_data_t *data; - xfs_dir2_free_t *free; - xfs_da_blkinfo_t *info; + xfs_dir2_data_hdr_t *hdr = rbp->data; + xfs_dir2_free_t *free = rbp->data; + xfs_da_blkinfo_t *info = rbp->data; uint magic, magic1; - info = rbp->data; - data = rbp->data; - free = rbp->data; magic = be16_to_cpu(info->magic); - magic1 = be32_to_cpu(data->hdr.magic); + magic1 = be32_to_cpu(hdr->magic); if (unlikely( XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && (magic != XFS_ATTR_LEAF_MAGIC) && @@ -2096,7 +2101,7 @@ xfs_da_do_buf( (magic != XFS_DIR2_LEAFN_MAGIC) && (magic1 != XFS_DIR2_BLOCK_MAGIC) && (magic1 != XFS_DIR2_DATA_MAGIC) && - (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC), + (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_); @@ -2143,8 +2148,7 @@ xfs_da_get_buf( xfs_dabuf_t **bpp, int whichfork) { - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0, - (inst_t *)__return_address); + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0); } /* @@ -2159,8 +2163,7 @@ xfs_da_read_buf( xfs_dabuf_t **bpp, int whichfork) { - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1, - (inst_t *)__return_address); + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1); } /* @@ -2176,8 +2179,7 @@ xfs_da_reada_buf( xfs_daddr_t rval; rval = -1; - if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3, - (inst_t *)__return_address)) + if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3)) return -1; else return rval; @@ -2235,17 +2237,12 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } -#ifdef XFS_DABUF_DEBUG -xfs_dabuf_t *xfs_dabuf_global_list; -static DEFINE_SPINLOCK(xfs_dabuf_global_lock); -#endif - /* * Create a dabuf. 
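The caller == 1 branch above aims three differently typed pointers at the same rbp->data buffer so one read can be validated against both the 16-bit da-btree magic and the 32-bit directory-data magics. A simplified sketch of that multi-view idiom (struct names invented; endian conversion elided):

    #include <stdint.h>

    struct da_blkinfo { uint16_t magic; };    /* node/leaf header view */
    struct data_hdr   { uint32_t magic; };    /* data block header view */

    static void read_magics(void *data, uint16_t *m16, uint32_t *m32)
    {
        struct da_blkinfo *info = data;       /* 16-bit magic view */
        struct data_hdr *hdr = data;          /* 32-bit magic view */

        *m16 = info->magic;    /* the kernel wraps these in be*_to_cpu() */
        *m32 = hdr->magic;
    }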
*/ /* ARGSUSED */ STATIC xfs_dabuf_t * -xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) +xfs_da_buf_make(int nbuf, xfs_buf_t **bps) { xfs_buf_t *bp; xfs_dabuf_t *dabuf; @@ -2257,11 +2254,6 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) else dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); dabuf->dirty = 0; -#ifdef XFS_DABUF_DEBUG - dabuf->ra = ra; - dabuf->target = XFS_BUF_TARGET(bps[0]); - dabuf->blkno = XFS_BUF_ADDR(bps[0]); -#endif if (nbuf == 1) { dabuf->nbuf = 1; bp = bps[0]; @@ -2281,23 +2273,6 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) XFS_BUF_COUNT(bp)); } } -#ifdef XFS_DABUF_DEBUG - { - xfs_dabuf_t *p; - - spin_lock(&xfs_dabuf_global_lock); - for (p = xfs_dabuf_global_list; p; p = p->next) { - ASSERT(p->blkno != dabuf->blkno || - p->target != dabuf->target); - } - dabuf->prev = NULL; - if (xfs_dabuf_global_list) - xfs_dabuf_global_list->prev = dabuf; - dabuf->next = xfs_dabuf_global_list; - xfs_dabuf_global_list = dabuf; - spin_unlock(&xfs_dabuf_global_lock); - } -#endif return dabuf; } @@ -2333,25 +2308,12 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf) ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); if (dabuf->dirty) xfs_da_buf_clean(dabuf); - if (dabuf->nbuf > 1) + if (dabuf->nbuf > 1) { kmem_free(dabuf->data); -#ifdef XFS_DABUF_DEBUG - { - spin_lock(&xfs_dabuf_global_lock); - if (dabuf->prev) - dabuf->prev->next = dabuf->next; - else - xfs_dabuf_global_list = dabuf->next; - if (dabuf->next) - dabuf->next->prev = dabuf->prev; - spin_unlock(&xfs_dabuf_global_lock); - } - memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf)); -#endif - if (dabuf->nbuf == 1) - kmem_zone_free(xfs_dabuf_zone, dabuf); - else kmem_free(dabuf); + } else { + kmem_zone_free(xfs_dabuf_zone, dabuf); + } } /* diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index fe9f5a8c1d2a..dbf7c074ae73 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -145,22 +145,11 @@ typedef struct xfs_dabuf { short dirty; /* data needs to be copied back */ short bbcount; /* how large is data in bbs */ void *data; /* pointer for buffers' data */ -#ifdef XFS_DABUF_DEBUG - inst_t *ra; /* return address of caller to make */ - struct xfs_dabuf *next; /* next in global chain */ - struct xfs_dabuf *prev; /* previous in global chain */ - struct xfs_buftarg *target; /* device for buffer */ - xfs_daddr_t blkno; /* daddr first in bps[0] */ -#endif struct xfs_buf *bps[1]; /* actually nbuf of these */ } xfs_dabuf_t; #define XFS_DA_BUF_SIZE(n) \ (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) -#ifdef XFS_DABUF_DEBUG -extern xfs_dabuf_t *xfs_dabuf_global_list; -#endif - /* * Storage for holding state during Btree searches and split/join ops. * @@ -248,6 +237,8 @@ int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, * Utility routines. 
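The xfs_dabuf structure that survives this cleanup still ends in bps[1] sized by XFS_DA_BUF_SIZE(): the pre-C99 trailing-array idiom of declaring one element and allocating room for n. A self-contained sketch:

    #include <stdlib.h>

    struct dabuf {
        int  nbuf;        /* number of buffer pointers */
        void *bps[1];     /* actually nbuf of these */
    };

    #define DABUF_SIZE(n) \
        (sizeof(struct dabuf) + sizeof(void *) * ((n) - 1))

    static struct dabuf *dabuf_make(int nbuf)
    {
        struct dabuf *d = malloc(DABUF_SIZE(nbuf));

        if (d)
            d->nbuf = nbuf;    /* bps[0..nbuf-1] are now addressable */
        return d;
    }

The "(n) - 1" accounts for the one element already counted inside sizeof(struct dabuf).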
*/ int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); +int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, + int count); int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, xfs_dabuf_t **bp, int whichfork); diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index dba7a71cedf3..4580ce00aeb4 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -24,20 +24,17 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" +#include "xfs_dir2.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -122,15 +119,15 @@ int xfs_dir_isempty( xfs_inode_t *dp) { - xfs_dir2_sf_t *sfp; + xfs_dir2_sf_hdr_t *sfp; ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); if (dp->i_d.di_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) return 0; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - return !sfp->hdr.count; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + return !sfp->count; } /* @@ -500,129 +497,34 @@ xfs_dir_canenter( /* * Add a block to the directory. - * This routine is for data and free blocks, not leaf/node blocks - * which are handled by xfs_da_grow_inode. + * + * This routine is for data and free blocks, not leaf/node blocks which are + * handled by xfs_da_grow_inode. */ int xfs_dir2_grow_inode( - xfs_da_args_t *args, - int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ - xfs_dir2_db_t *dbp) /* out: block number added */ + struct xfs_da_args *args, + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ { - xfs_fileoff_t bno; /* directory offset of new block */ - int count; /* count of filesystem blocks */ - xfs_inode_t *dp; /* incore directory inode */ - int error; - int got; /* blocks actually mapped */ - int i; - xfs_bmbt_irec_t map; /* single structure for bmap */ - int mapi; /* mapping index */ - xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */ - xfs_mount_t *mp; - int nmap; /* number of bmap entries */ - xfs_trans_t *tp; - xfs_drfsbno_t nblks; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + int error; trace_xfs_dir2_grow_inode(args, space); - dp = args->dp; - tp = args->trans; - mp = dp->i_mount; - nblks = dp->i_d.di_nblocks; /* * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); count = mp->m_dirblkfsbs; - /* - * Find the first hole for our block. - */ - if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) - return error; - nmap = 1; - ASSERT(args->firstblock != NULL); - /* - * Try mapping the new block contiguously (one extent). - */ - if ((error = xfs_bmapi(tp, dp, bno, count, - XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, - args->firstblock, args->total, &map, &nmap, - args->flist))) - return error; - ASSERT(nmap <= 1); - if (nmap == 1) { - mapp = &map; - mapi = 1; - } - /* - * Didn't work and this is a multiple-fsb directory block.
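xfs_dir2_grow_inode() picks its starting offset from the requested space: v2 directories keep data, leaf, and freeindex blocks in disjoint fixed byte ranges of the directory file, XFS_DIR2_SPACE_SIZE (32 GB) apart. A sketch of the offset arithmetic, assuming 4k filesystem blocks (the constants here are illustrative restatements, not kernel headers):

    #include <stdint.h>

    #define SPACE_SIZE  (1ULL << 35)    /* XFS_DIR2_SPACE_SIZE: 32 GB */
    #define BLOCK_SHIFT 12              /* assumed 4k filesystem blocks */

    enum { DATA_SPACE, LEAF_SPACE, FREE_SPACE };

    /* lowest file block belonging to the given space */
    static uint64_t space_first_block(int space)
    {
        return (space * SPACE_SIZE) >> BLOCK_SHIFT;
    }

With these numbers the leaf space should start at file block 8388608, which is the value m_dirleafblk feeds into the xfs_da_grow_inode() wrapper earlier in this patch.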
- * Try again with contiguous flag turned on. - */ - else if (nmap == 0 && count > 1) { - xfs_fileoff_t b; /* current file offset */ - /* - * Space for maximum number of mappings. - */ - mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); - /* - * Iterate until we get to the end of our block. - */ - for (b = bno, mapi = 0; b < bno + count; ) { - int c; /* current fsb count */ - - /* - * Can't map more than MAX_NMAP at once. - */ - nmap = MIN(XFS_BMAP_MAX_NMAP, count); - c = (int)(bno + count - b); - if ((error = xfs_bmapi(tp, dp, b, c, - XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, - args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist))) { - kmem_free(mapp); - return error; - } - if (nmap < 1) - break; - /* - * Add this bunch into our table, go to the next offset. - */ - mapi += nmap; - b = mapp[mapi - 1].br_startoff + - mapp[mapi - 1].br_blockcount; - } - } - /* - * Didn't work. - */ - else { - mapi = 0; - mapp = NULL; - } - /* - * See how many fsb's we got. - */ - for (i = 0, got = 0; i < mapi; i++) - got += mapp[i].br_blockcount; - /* - * Didn't get enough fsb's, or the first/last block's are wrong. - */ - if (got != count || mapp[0].br_startoff != bno || - mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != - bno + count) { - if (mapp != &map) - kmem_free(mapp); - return XFS_ERROR(ENOSPC); - } - /* - * Done with the temporary mapping table. - */ - if (mapp != &map) - kmem_free(mapp); + error = xfs_da_grow_inode_int(args, &bno, count); + if (error) + return error; - /* account for newly allocated blocks in reserved blocks total */ - args->total -= dp->i_d.di_nblocks - nblks; *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); /* @@ -634,7 +536,7 @@ xfs_dir2_grow_inode( size = XFS_FSB_TO_B(mp, bno + count); if (size > dp->i_d.di_size) { dp->i_d.di_size = size; - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); } } return 0; diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 74a3b1057685..e937d9991c18 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -16,49 +16,14 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DIR2_H__ -#define __XFS_DIR2_H__ +#define __XFS_DIR2_H__ -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_dir2_put_args; struct xfs_bmap_free; +struct xfs_da_args; struct xfs_inode; struct xfs_mount; struct xfs_trans; -/* - * Directory version 2. - * There are 4 possible formats: - * shortform - * single block - data with embedded leaf at the end - * multiple data blocks, single leaf+freeindex block - * data blocks, node&leaf blocks (btree), freeindex blocks - * - * The shortform format is in xfs_dir2_sf.h. - * The single block format is in xfs_dir2_block.h. - * The data block format is in xfs_dir2_data.h. - * The leaf and freeindex block formats are in xfs_dir2_leaf.h. - * Node blocks are the same as the other version, in xfs_da_btree.h. - */ - -/* - * Byte offset in data block and shortform entry. - */ -typedef __uint16_t xfs_dir2_data_off_t; -#define NULLDATAOFF 0xffffU -typedef uint xfs_dir2_data_aoff_t; /* argument form */ - -/* - * Directory block number (logical dirblk in file) - */ -typedef __uint32_t xfs_dir2_db_t; - -/* - * Byte offset in a directory. 
- */ -typedef xfs_off_t xfs_dir2_off_t; - extern struct xfs_name xfs_name_dotdot; /* @@ -86,21 +51,10 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, uint resblks); -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); /* - * Utility routines for v2 directories. + * Direct call from the bmap code, bypassing the generic directory layer. */ -extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, - xfs_dir2_db_t *dbp); -extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, - int *vp); -extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, - int *vp); -extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, - struct xfs_dabuf *bp); - -extern int xfs_dir_cilookup_result(struct xfs_da_args *args, - const unsigned char *name, int len); +extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 580d99cef9e7..9245e029b8ea 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -23,17 +23,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -67,7 +64,7 @@ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { xfs_dir2_data_free_t *bf; /* bestfree table in block */ - xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ xfs_dabuf_t *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ @@ -105,13 +102,13 @@ xfs_dir2_block_addname( return error; } ASSERT(bp != NULL); - block = bp->data; + hdr = bp->data; /* * Check the magic number, corrupted if wrong. */ - if (unlikely(be32_to_cpu(block->hdr.magic) != XFS_DIR2_BLOCK_MAGIC)) { + if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", - XFS_ERRLEVEL_LOW, mp, block); + XFS_ERRLEVEL_LOW, mp, hdr); xfs_da_brelse(tp, bp); return XFS_ERROR(EFSCORRUPTED); } @@ -119,8 +116,8 @@ xfs_dir2_block_addname( /* * Set up pointers to parts of the block. */ - bf = block->hdr.bestfree; - btp = xfs_dir2_block_tail_p(mp, block); + bf = hdr->bestfree; + btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * No stale entries? Need space for entry and new leaf. @@ -133,7 +130,7 @@ xfs_dir2_block_addname( /* * Data object just before the first leaf entry. 
*/ - enddup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free then can't do this add without cleaning up: * the space before the first leaf entry needs to be free so it @@ -146,7 +143,7 @@ xfs_dir2_block_addname( */ else { dup = (xfs_dir2_data_unused_t *) - ((char *)block + be16_to_cpu(bf[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); if (dup == enddup) { /* * It is the biggest freespace, is it too small @@ -159,7 +156,7 @@ xfs_dir2_block_addname( */ if (be16_to_cpu(bf[1].length) >= len) dup = (xfs_dir2_data_unused_t *) - ((char *)block + + ((char *)hdr + be16_to_cpu(bf[1].offset)); else dup = NULL; @@ -182,7 +179,7 @@ xfs_dir2_block_addname( */ else if (be16_to_cpu(bf[0].length) >= len) { dup = (xfs_dir2_data_unused_t *) - ((char *)block + be16_to_cpu(bf[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); compact = 0; } /* @@ -196,7 +193,7 @@ xfs_dir2_block_addname( /* * Data object just before the first leaf entry. */ - dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free then the data will go where the * leaf data starts now, if it works at all. @@ -255,7 +252,8 @@ xfs_dir2_block_addname( highstale = lfloghigh = -1; fromidx >= 0; fromidx--) { - if (be32_to_cpu(blp[fromidx].address) == XFS_DIR2_NULL_DATAPTR) { + if (blp[fromidx].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (highstale == -1) highstale = toidx; else { @@ -272,7 +270,7 @@ xfs_dir2_block_addname( lfloghigh -= be32_to_cpu(btp->stale) - 1; be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), &needlog, &needscan); blp += be32_to_cpu(btp->stale) - 1; @@ -282,7 +280,7 @@ xfs_dir2_block_addname( * This needs to happen before the next call to use_free. */ if (needscan) { - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); needscan = 0; } } @@ -318,7 +316,7 @@ xfs_dir2_block_addname( */ xfs_dir2_data_use_free(tp, bp, enddup, (xfs_dir2_data_aoff_t) - ((char *)enddup - (char *)block + be16_to_cpu(enddup->length) - + ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - sizeof(*blp)), (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog, &needscan); @@ -331,8 +329,7 @@ xfs_dir2_block_addname( * This needs to happen before the next call to use_free. 
*/ if (needscan) { - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, - &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); needscan = 0; } /* @@ -353,12 +350,14 @@ xfs_dir2_block_addname( else { for (lowstale = mid; lowstale >= 0 && - be32_to_cpu(blp[lowstale].address) != XFS_DIR2_NULL_DATAPTR; + blp[lowstale].address != + cpu_to_be32(XFS_DIR2_NULL_DATAPTR); lowstale--) continue; for (highstale = mid + 1; highstale < be32_to_cpu(btp->count) && - be32_to_cpu(blp[highstale].address) != XFS_DIR2_NULL_DATAPTR && + blp[highstale].address != + cpu_to_be32(XFS_DIR2_NULL_DATAPTR) && (lowstale < 0 || mid - lowstale > highstale - mid); highstale++) continue; @@ -397,13 +396,13 @@ xfs_dir2_block_addname( */ blp[mid].hashval = cpu_to_be32(args->hashval); blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, - (char *)dep - (char *)block)); + (char *)dep - (char *)hdr)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* * Mark space for the data entry used. */ xfs_dir2_data_use_free(tp, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), (xfs_dir2_data_aoff_t)len, &needlog, &needscan); /* * Create the new data entry. @@ -412,12 +411,12 @@ xfs_dir2_block_addname( dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); tagp = xfs_dir2_data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, bp); xfs_dir2_block_log_tail(tp, bp); @@ -437,7 +436,7 @@ xfs_dir2_block_getdents( xfs_off_t *offset, filldir_t filldir) { - xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dabuf_t *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ @@ -470,13 +469,13 @@ xfs_dir2_block_getdents( * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(mp, *offset); - block = bp->data; + hdr = bp->data; xfs_dir2_data_check(dp, bp); /* * Set up values for the loop. */ - btp = xfs_dir2_block_tail_p(mp, block); - ptr = (char *)block->u; + btp = xfs_dir2_block_tail_p(mp, hdr); + ptr = (char *)(hdr + 1); endptr = (char *)xfs_dir2_block_leaf_p(btp); /* @@ -502,11 +501,11 @@ xfs_dir2_block_getdents( /* * The entry is before the desired starting point, skip it. */ - if ((char *)dep - (char *)block < wantoff) + if ((char *)dep - (char *)hdr < wantoff) continue; cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - (char *)dep - (char *)block); + (char *)dep - (char *)hdr); /* * If it didn't fit, set the final offset to here & return. 
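The lowstale/highstale loops above search outward from the insertion point for the nearest stale leaf entry on each side, so that compaction moves the shorter run of entries. A compact sketch with the stale test reduced to an int array (names illustrative):

    /* stale[] flags reusable slots; find the nearest around mid */
    static void find_stale(const int *stale, int count, int mid,
                           int *low, int *high)
    {
        int lowstale, highstale;

        for (lowstale = mid;
             lowstale >= 0 && !stale[lowstale];
             lowstale--)
            continue;
        /* give up early once the low side is provably closer */
        for (highstale = mid + 1;
             highstale < count && !stale[highstale] &&
             (lowstale < 0 || mid - lowstale > highstale - mid);
             highstale++)
            continue;

        *low = lowstale;
        *high = highstale;
    }

The early-exit condition in the second loop mirrors the kernel code: once a stale slot below mid is closer than any slot above could be, scanning upward is pointless.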
@@ -540,17 +539,14 @@ xfs_dir2_block_log_leaf( int first, /* index of first logged leaf */ int last) /* index of last logged leaf */ { - xfs_dir2_block_t *block; /* directory block structure */ - xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dir2_block_tail_t *btp; /* block tail */ - xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_leaf_entry_t *blp; + xfs_dir2_block_tail_t *btp; - mp = tp->t_mountp; - block = bp->data; - btp = xfs_dir2_block_tail_p(mp, block); + btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); blp = xfs_dir2_block_leaf_p(btp); - xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), - (uint)((char *)&blp[last + 1] - (char *)block - 1)); + xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), + (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); } /* @@ -561,15 +557,12 @@ xfs_dir2_block_log_tail( xfs_trans_t *tp, /* transaction structure */ xfs_dabuf_t *bp) /* block buffer */ { - xfs_dir2_block_t *block; /* directory block structure */ - xfs_dir2_block_tail_t *btp; /* block tail */ - xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_block_tail_t *btp; - mp = tp->t_mountp; - block = bp->data; - btp = xfs_dir2_block_tail_p(mp, block); - xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), - (uint)((char *)(btp + 1) - (char *)block - 1)); + btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); + xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), + (uint)((char *)(btp + 1) - (char *)hdr - 1)); } /* @@ -580,7 +573,7 @@ int /* error */ xfs_dir2_block_lookup( xfs_da_args_t *args) /* dir lookup arguments */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ xfs_dabuf_t *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ @@ -600,14 +593,14 @@ xfs_dir2_block_lookup( return error; dp = args->dp; mp = dp->i_mount; - block = bp->data; + hdr = bp->data; xfs_dir2_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, block); + btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ - dep = (xfs_dir2_data_entry_t *)((char *)block + + dep = (xfs_dir2_data_entry_t *)((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); /* * Fill in inode number, CI name if appropriate, release the block. @@ -628,7 +621,7 @@ xfs_dir2_block_lookup_int( int *entno) /* returned entry number */ { xfs_dir2_dataptr_t addr; /* data entry address */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ xfs_dabuf_t *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ @@ -654,9 +647,9 @@ xfs_dir2_block_lookup_int( return error; } ASSERT(bp != NULL); - block = bp->data; + hdr = bp->data; xfs_dir2_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, block); + btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. @@ -694,7 +687,7 @@ xfs_dir2_block_lookup_int( * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); + ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); /* * Compare name and if it's an exact match, return the index * and buffer. 
If it's the first case-insensitive match, store @@ -733,7 +726,7 @@ int /* error */ xfs_dir2_block_removename( xfs_da_args_t *args) /* directory operation args */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ xfs_dabuf_t *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ @@ -760,20 +753,20 @@ xfs_dir2_block_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - block = bp->data; - btp = xfs_dir2_block_tail_p(mp, block); + hdr = bp->data; + btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), + (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * Fix up the block tail. @@ -789,15 +782,15 @@ xfs_dir2_block_removename( * Fix up bestfree, log the header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, bp); xfs_dir2_data_check(dp, bp); /* * See if the size as a shortform is good enough. */ - if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > - XFS_IFORK_DSIZE(dp)) { + size = xfs_dir2_block_sfsize(dp, hdr, &sfh); + if (size > XFS_IFORK_DSIZE(dp)) { xfs_da_buf_done(bp); return 0; } @@ -815,7 +808,7 @@ int /* error */ xfs_dir2_block_replace( xfs_da_args_t *args) /* directory operation args */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ xfs_dabuf_t *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ @@ -836,14 +829,14 @@ xfs_dir2_block_replace( } dp = args->dp; mp = dp->i_mount; - block = bp->data; - btp = xfs_dir2_block_tail_p(mp, block); + hdr = bp->data; + btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. 
@@ -882,7 +875,7 @@ xfs_dir2_leaf_to_block( xfs_dabuf_t *dbp) /* data buffer */ { __be16 *bestsp; /* leaf bests table */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused data entry */ @@ -906,7 +899,7 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * If there are data blocks other than the first one, take this @@ -917,7 +910,7 @@ xfs_dir2_leaf_to_block( while (dp->i_d.di_size > mp->m_dirblksize) { bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - (uint)sizeof(block->hdr)) { + mp->m_dirblksize - (uint)sizeof(*hdr)) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) @@ -935,18 +928,18 @@ xfs_dir2_leaf_to_block( XFS_DATA_FORK))) { goto out; } - block = dbp->data; - ASSERT(be32_to_cpu(block->hdr.magic) == XFS_DIR2_DATA_MAGIC); + hdr = dbp->data; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); /* * Size of the "leaf" area in the block. */ - size = (uint)sizeof(block->tail) + + size = (uint)sizeof(xfs_dir2_block_tail_t) + (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); /* * Look at the last data entry. */ - tagp = (__be16 *)((char *)block + mp->m_dirblksize) - 1; - dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + tagp = (__be16 *)((char *)hdr + mp->m_dirblksize) - 1; + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free or is too short we can't do it. */ @@ -958,7 +951,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; /* @@ -969,7 +962,7 @@ xfs_dir2_leaf_to_block( /* * Initialize the block tail. */ - btp = xfs_dir2_block_tail_p(mp, block); + btp = xfs_dir2_block_tail_p(mp, hdr); btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); @@ -978,7 +971,8 @@ xfs_dir2_leaf_to_block( */ lep = xfs_dir2_block_leaf_p(btp); for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { - if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + if (leaf->ents[from].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; lep[to++] = leaf->ents[from]; } @@ -988,7 +982,7 @@ xfs_dir2_leaf_to_block( * Scan the bestfree if we need it and log the data block header. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); /* @@ -1002,8 +996,8 @@ xfs_dir2_leaf_to_block( /* * Now see if the resulting block can be shrunken to shortform. 
*/ - if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > - XFS_IFORK_DSIZE(dp)) { + size = xfs_dir2_block_sfsize(dp, hdr, &sfh); + if (size > XFS_IFORK_DSIZE(dp)) { error = 0; goto out; } @@ -1024,12 +1018,10 @@ xfs_dir2_sf_to_block( xfs_da_args_t *args) /* operation arguments */ { xfs_dir2_db_t blkno; /* dir-relative block # (0) */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ xfs_dabuf_t *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ - char *buf; /* sf buffer */ - int buf_len; xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ int dummy; /* trash */ @@ -1043,7 +1035,8 @@ xfs_dir2_sf_to_block( int newoffset; /* offset from current entry */ int offset; /* target block offset */ xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_name name; @@ -1061,32 +1054,30 @@ xfs_dir2_sf_to_block( ASSERT(XFS_FORCED_SHUTDOWN(mp)); return XFS_ERROR(EIO); } + + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); + /* - * Copy the directory into the stack buffer. + * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ + sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); + memcpy(sfp, oldsfp, dp->i_df.if_bytes); - buf_len = dp->i_df.if_bytes; - buf = kmem_alloc(buf_len, KM_SLEEP); - - memcpy(buf, sfp, buf_len); - xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK); + xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); dp->i_d.di_size = 0; xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - /* - * Reset pointer - old sfp is gone. - */ - sfp = (xfs_dir2_sf_t *)buf; + /* * Add block 0 to the inode. */ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); if (error) { - kmem_free(buf); + kmem_free(sfp); return error; } /* @@ -1094,21 +1085,21 @@ xfs_dir2_sf_to_block( */ error = xfs_dir2_data_init(args, blkno, &bp); if (error) { - kmem_free(buf); + kmem_free(sfp); return error; } - block = bp->data; - block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + hdr = bp->data; + hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* * Compute size of block "tail" area. */ i = (uint)sizeof(*btp) + - (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); /* * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = (xfs_dir2_data_unused_t *)block->u; + dup = (xfs_dir2_data_unused_t *)(hdr + 1); needlog = needscan = 0; xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, &needscan); @@ -1116,50 +1107,51 @@ xfs_dir2_sf_to_block( /* * Fill in the tail. */ - btp = xfs_dir2_block_tail_p(mp, block); - btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */ + btp = xfs_dir2_block_tail_p(mp, hdr); + btp->count = cpu_to_be32(sfp->count + 2); /* ., .. 
*/ btp->stale = 0; blp = xfs_dir2_block_leaf_p(btp); - endoffset = (uint)((char *)blp - (char *)block); + endoffset = (uint)((char *)blp - (char *)hdr); /* * Remove the freespace, we'll manage it. */ xfs_dir2_data_use_free(tp, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), be16_to_cpu(dup->length), &needlog, &needscan); /* * Create entry for . */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATA_DOT_OFFSET); + ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; tagp = xfs_dir2_data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, - (char *)dep - (char *)block)); + (char *)dep - (char *)hdr)); /* * Create entry for .. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); - dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); + ((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; tagp = xfs_dir2_data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, - (char *)dep - (char *)block)); + (char *)dep - (char *)hdr)); offset = XFS_DIR2_DATA_FIRST_OFFSET; /* * Loop over existing entries, stuff them in. */ - if ((i = 0) == sfp->hdr.count) + i = 0; + if (!sfp->count) sfep = NULL; else sfep = xfs_dir2_sf_firstentry(sfp); @@ -1179,43 +1171,40 @@ xfs_dir2_sf_to_block( * There should be a hole here, make one. */ if (offset < newoffset) { - dup = (xfs_dir2_data_unused_t *) - ((char *)block + offset); + dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); dup->length = cpu_to_be16(newoffset - offset); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( - ((char *)dup - (char *)block)); + ((char *)dup - (char *)hdr)); xfs_dir2_data_log_unused(tp, bp, dup); - (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, - dup, &dummy); + xfs_dir2_data_freeinsert(hdr, dup, &dummy); offset += be16_to_cpu(dup->length); continue; } /* * Copy a real entry. 
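As the shortform entries are replayed into the new block above, any gap between the write cursor and an entry's assigned offset is turned into an explicitly tagged free span, keeping every byte of the block described. A sketch with simplified on-disk types (the FREE_TAG value and layout are illustrative):

    #include <stdint.h>

    #define FREE_TAG 0xffff    /* illustrative free-space tag */

    struct unused {
        uint16_t freetag;      /* FREE_TAG marks the span free */
        uint16_t length;       /* span length in bytes */
    };

    /* turn the gap [offset, newoffset) into a tagged free span */
    static int fill_hole(char *block, int offset, int newoffset)
    {
        if (offset < newoffset) {
            struct unused *dup = (struct unused *)(block + offset);

            dup->freetag = FREE_TAG;
            dup->length = (uint16_t)(newoffset - offset);
            offset += dup->length;
        }
        return offset;         /* write cursor after the hole */
    }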
*/ - dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); - dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, - xfs_dir2_sf_inumberp(sfep))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); + dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep)); dep->namelen = sfep->namelen; memcpy(dep->name, sfep->name, dep->namelen); tagp = xfs_dir2_data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, bp, dep); name.name = sfep->name; name.len = sfep->namelen; blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> hashname(&name)); blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, - (char *)dep - (char *)block)); - offset = (int)((char *)(tagp + 1) - (char *)block); - if (++i == sfp->hdr.count) + (char *)dep - (char *)hdr)); + offset = (int)((char *)(tagp + 1) - (char *)hdr); + if (++i == sfp->count) sfep = NULL; else sfep = xfs_dir2_sf_nextentry(sfp, sfep); } /* Done with the temporary buffer */ - kmem_free(buf); + kmem_free(sfp); /* * Sort the leaf entries by hash value. */ diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h deleted file mode 100644 index 10e689676382..000000000000 --- a/fs/xfs/xfs_dir2_block.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_BLOCK_H__ -#define __XFS_DIR2_BLOCK_H__ - -/* - * xfs_dir2_block.h - * Directory version 2, single block format structures - */ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_dir2_data_hdr; -struct xfs_dir2_leaf_entry; -struct xfs_inode; -struct xfs_mount; -struct xfs_trans; - -/* - * The single block format is as follows: - * xfs_dir2_data_hdr_t structure - * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures - * xfs_dir2_leaf_entry_t structures - * xfs_dir2_block_tail_t structure - */ - -#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */ - -typedef struct xfs_dir2_block_tail { - __be32 count; /* count of leaf entries */ - __be32 stale; /* count of stale lf entries */ -} xfs_dir2_block_tail_t; - -/* - * Generic single-block structure, for xfs_db. 
- */ -typedef struct xfs_dir2_block { - xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */ - xfs_dir2_data_union_t u[1]; - xfs_dir2_leaf_entry_t leaf[1]; - xfs_dir2_block_tail_t tail; -} xfs_dir2_block_t; - -/* - * Pointer to the leaf header embedded in a data block (1-block format) - */ -static inline xfs_dir2_block_tail_t * -xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) -{ - return (((xfs_dir2_block_tail_t *) - ((char *)(block) + (mp)->m_dirblksize)) - 1); -} - -/* - * Pointer to the leaf entries embedded in a data block (1-block format) - */ -static inline struct xfs_dir2_leaf_entry * -xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) -{ - return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); -} - -/* - * Function declarations. - */ -extern int xfs_dir2_block_addname(struct xfs_da_args *args); -extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); -extern int xfs_dir2_block_lookup(struct xfs_da_args *args); -extern int xfs_dir2_block_removename(struct xfs_da_args *args); -extern int xfs_dir2_block_replace(struct xfs_da_args *args); -extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, - struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); -extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); - -#endif /* __XFS_DIR2_BLOCK_H__ */ diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 921595b84f5b..5bbe2a8a023f 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -23,18 +23,18 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" +STATIC xfs_dir2_data_free_t * +xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); + #ifdef DEBUG /* * Check the consistency of the data block. @@ -50,7 +50,7 @@ xfs_dir2_data_check( xfs_dir2_data_free_t *bf; /* bestfree table */ xfs_dir2_block_tail_t *btp=NULL; /* block tail */ int count; /* count of entries found */ - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ @@ -66,17 +66,19 @@ xfs_dir2_data_check( struct xfs_name name; mp = dp->i_mount; - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - bf = d->hdr.bestfree; - p = (char *)d->u; - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + hdr = bp->data; + bf = hdr->bestfree; + p = (char *)(hdr + 1); + + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; - } else - endp = (char *)d + mp->m_dirblksize; + } else { + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + endp = (char *)hdr + mp->m_dirblksize; + } + count = lastfree = freeseen = 0; /* * Account for zero bestfree entries. 
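xfs_dir2_data_check() above leans on two pointer idioms that recur through this patch: "hdr + 1" for the first payload byte (what the deleted xfs_dir2_data_t spelled d->u), and tail/leaf structures carved backward off the end of the block. A sketch with simplified types, counts kept in CPU byte order for brevity:

    #include <stdint.h>

    struct hdr  { uint32_t magic; };
    struct tail { uint32_t count; uint32_t stale; };
    struct leaf { uint32_t hashval; uint32_t address; };

    /* first payload byte: immediately after the header */
    static char *payload(struct hdr *h)
    {
        return (char *)(h + 1);
    }

    /* tail occupies the last sizeof(struct tail) bytes of the block */
    static struct tail *tail_p(struct hdr *h, int blksize)
    {
        return (struct tail *)((char *)h + blksize) - 1;
    }

    /* leaf entries sit immediately below the tail, count of them */
    static struct leaf *leaf_p(struct tail *t)
    {
        return (struct leaf *)t - t->count;
    }

These are the same shapes the deleted xfs_dir2_block.h exposed as xfs_dir2_block_tail_p() and xfs_dir2_block_leaf_p().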
@@ -108,8 +110,8 @@ xfs_dir2_data_check( if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ASSERT(lastfree == 0); ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)d); - dfp = xfs_dir2_data_freefind(d, dup); + (char *)dup - (char *)hdr); + dfp = xfs_dir2_data_freefind(hdr, dup); if (dfp) { i = (int)(dfp - bf); ASSERT((freeseen & (1 << i)) == 0); @@ -132,13 +134,13 @@ xfs_dir2_data_check( ASSERT(dep->namelen != 0); ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == - (char *)dep - (char *)d); + (char *)dep - (char *)hdr); count++; lastfree = 0; - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)d)); + ((char *)dep - (char *)hdr)); name.name = dep->name; name.len = dep->namelen; hash = mp->m_dirnameops->hashname(&name); @@ -155,9 +157,10 @@ xfs_dir2_data_check( * Need to have seen all the entries and all the bestfree slots. */ ASSERT(freeseen == 7); - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { - if (be32_to_cpu(lep[i].address) == XFS_DIR2_NULL_DATAPTR) + if (lep[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); @@ -172,9 +175,9 @@ xfs_dir2_data_check( * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. */ -xfs_dir2_data_free_t * +STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind( - xfs_dir2_data_t *d, /* data block */ + xfs_dir2_data_hdr_t *hdr, /* data block */ xfs_dir2_data_unused_t *dup) /* data unused entry */ { xfs_dir2_data_free_t *dfp; /* bestfree entry */ @@ -184,17 +187,17 @@ xfs_dir2_data_freefind( int seenzero; /* saw a 0 bestfree entry */ #endif - off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d); + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); #if defined(DEBUG) && defined(__KERNEL__) /* * Validate some consistency in the bestfree table. * Check order, non-overlapping entries, and if we find the * one we're looking for it has to be exact. */ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0; - dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + for (dfp = &hdr->bestfree[0], seenzero = matched = 0; + dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) { ASSERT(!dfp->length); @@ -210,7 +213,7 @@ xfs_dir2_data_freefind( else ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); - if (dfp > &d->hdr.bestfree[0]) + if (dfp > &hdr->bestfree[0]) ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); } #endif @@ -219,13 +222,13 @@ xfs_dir2_data_freefind( * it can't be there since they're sorted. */ if (be16_to_cpu(dup->length) < - be16_to_cpu(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) return NULL; /* * Look at the three bestfree entries for our guy. 
*/ - for (dfp = &d->hdr.bestfree[0]; - dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + for (dfp = &hdr->bestfree[0]; + dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) return NULL; @@ -243,7 +246,7 @@ xfs_dir2_data_freefind( */ xfs_dir2_data_free_t * /* entry inserted */ xfs_dir2_data_freeinsert( - xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_hdr_t *hdr, /* data block pointer */ xfs_dir2_data_unused_t *dup, /* unused space */ int *loghead) /* log the data header (out) */ { @@ -251,12 +254,13 @@ xfs_dir2_data_freeinsert( xfs_dir2_data_free_t new; /* new bestfree entry */ #ifdef __KERNEL__ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); #endif - dfp = d->hdr.bestfree; + dfp = hdr->bestfree; new.length = dup->length; - new.offset = cpu_to_be16((char *)dup - (char *)d); + new.offset = cpu_to_be16((char *)dup - (char *)hdr); + /* * Insert at position 0, 1, or 2; or not at all. */ @@ -286,36 +290,36 @@ xfs_dir2_data_freeinsert( */ STATIC void xfs_dir2_data_freeremove( - xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_hdr_t *hdr, /* data block header */ xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ int *loghead) /* out: log data header */ { #ifdef __KERNEL__ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); #endif /* * It's the first entry, slide the next 2 up. */ - if (dfp == &d->hdr.bestfree[0]) { - d->hdr.bestfree[0] = d->hdr.bestfree[1]; - d->hdr.bestfree[1] = d->hdr.bestfree[2]; + if (dfp == &hdr->bestfree[0]) { + hdr->bestfree[0] = hdr->bestfree[1]; + hdr->bestfree[1] = hdr->bestfree[2]; } /* * It's the second entry, slide the 3rd entry up. */ - else if (dfp == &d->hdr.bestfree[1]) - d->hdr.bestfree[1] = d->hdr.bestfree[2]; + else if (dfp == &hdr->bestfree[1]) + hdr->bestfree[1] = hdr->bestfree[2]; /* * Must be the last entry. */ else - ASSERT(dfp == &d->hdr.bestfree[2]); + ASSERT(dfp == &hdr->bestfree[2]); /* * Clear the 3rd entry, must be zero now. */ - d->hdr.bestfree[2].length = 0; - d->hdr.bestfree[2].offset = 0; + hdr->bestfree[2].length = 0; + hdr->bestfree[2].offset = 0; *loghead = 1; } @@ -325,7 +329,7 @@ xfs_dir2_data_freeremove( void xfs_dir2_data_freescan( xfs_mount_t *mp, /* filesystem mount point */ - xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_hdr_t *hdr, /* data block header */ int *loghead) /* out: log data header */ { xfs_dir2_block_tail_t *btp; /* block tail */ @@ -335,23 +339,23 @@ xfs_dir2_data_freescan( char *p; /* current entry pointer */ #ifdef __KERNEL__ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); #endif /* * Start by clearing the table. */ - memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree)); + memset(hdr->bestfree, 0, sizeof(hdr->bestfree)); *loghead = 1; /* * Set up pointers. 
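xfs_dir2_data_freeinsert() keeps the three bestfree slots ordered by length, largest first, inserting at position 0, 1, or 2 or not at all. A stand-alone sketch of that ordering:

    #include <stdint.h>

    struct bf { uint16_t offset; uint16_t length; };

    /* keep bf[0..2] sorted by length, largest first */
    static void freeinsert(struct bf bf[3], struct bf cand)
    {
        if (cand.length > bf[0].length) {
            bf[2] = bf[1];
            bf[1] = bf[0];
            bf[0] = cand;
        } else if (cand.length > bf[1].length) {
            bf[2] = bf[1];
            bf[1] = cand;
        } else if (cand.length > bf[2].length) {
            bf[2] = cand;
        }
        /* else: too small to track; dropping it is fine */
    }

Keeping the table sorted is what lets freefind() bail out early when a span is shorter than the third entry, and what makes freeremove()'s slide-up logic valid.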
*/ - p = (char *)d->u; - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + p = (char *)(hdr + 1); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + btp = xfs_dir2_block_tail_p(mp, hdr); endp = (char *)xfs_dir2_block_leaf_p(btp); } else - endp = (char *)d + mp->m_dirblksize; + endp = (char *)hdr + mp->m_dirblksize; /* * Loop over the block's entries. */ @@ -361,9 +365,9 @@ xfs_dir2_data_freescan( * If it's a free entry, insert it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT((char *)dup - (char *)d == + ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); - xfs_dir2_data_freeinsert(d, dup, loghead); + xfs_dir2_data_freeinsert(hdr, dup, loghead); p += be16_to_cpu(dup->length); } /* @@ -371,7 +375,7 @@ xfs_dir2_data_freescan( */ else { dep = (xfs_dir2_data_entry_t *)p; - ASSERT((char *)dep - (char *)d == + ASSERT((char *)dep - (char *)hdr == be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); p += xfs_dir2_data_entsize(dep->namelen); } @@ -389,7 +393,7 @@ xfs_dir2_data_init( xfs_dabuf_t **bpp) /* output block buffer */ { xfs_dabuf_t *bp; /* block buffer */ - xfs_dir2_data_t *d; /* pointer to block */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ int error; /* error return value */ @@ -410,26 +414,28 @@ xfs_dir2_data_init( return error; } ASSERT(bp != NULL); + /* * Initialize the header. */ - d = bp->data; - d->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - d->hdr.bestfree[0].offset = cpu_to_be16(sizeof(d->hdr)); + hdr = bp->data; + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - d->hdr.bestfree[i].length = 0; - d->hdr.bestfree[i].offset = 0; + hdr->bestfree[i].length = 0; + hdr->bestfree[i].offset = 0; } + /* * Set up an unused entry for the block's body. */ - dup = &d->u[0].unused; + dup = (xfs_dir2_data_unused_t *)(hdr + 1); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t=mp->m_dirblksize - (uint)sizeof(d->hdr); - d->hdr.bestfree[0].length = cpu_to_be16(t); + t = mp->m_dirblksize - (uint)sizeof(*hdr); + hdr->bestfree[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); - *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)d); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* * Log it and return it. 
*/ @@ -448,14 +454,14 @@ xfs_dir2_data_log_entry( xfs_dabuf_t *bp, /* block buffer */ xfs_dir2_data_entry_t *dep) /* data entry pointer */ { - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr = bp->data; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), + xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - - (char *)d - 1)); + (char *)hdr - 1)); } /* @@ -466,13 +472,12 @@ xfs_dir2_data_log_header( xfs_trans_t *tp, /* transaction pointer */ xfs_dabuf_t *bp) /* block buffer */ { - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr = bp->data; - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d), - (uint)(sizeof(d->hdr) - 1)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + + xfs_da_log_buf(tp, bp, 0, sizeof(*hdr) - 1); } /* @@ -484,23 +489,23 @@ xfs_dir2_data_log_unused( xfs_dabuf_t *bp, /* block buffer */ xfs_dir2_data_unused_t *dup) /* data unused pointer */ { - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr = bp->data; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); /* * Log the first part of the unused entry. */ - xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d), + xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - - 1 - (char *)d)); + 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ xfs_da_log_buf(tp, bp, - (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d), - (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d + + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); } @@ -517,7 +522,7 @@ xfs_dir2_data_make_free( int *needlogp, /* out: log header */ int *needscanp) /* out: regen bestfree */ { - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr; /* data block pointer */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ char *endptr; /* end of data area */ xfs_mount_t *mp; /* filesystem mount point */ @@ -527,28 +532,29 @@ xfs_dir2_data_make_free( xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ mp = tp->t_mountp; - d = bp->data; + hdr = bp->data; + /* * Figure out where the end of the data area is. 
*/ - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC) - endptr = (char *)d + mp->m_dirblksize; + if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + endptr = (char *)hdr + mp->m_dirblksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + btp = xfs_dir2_block_tail_p(mp, hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); } /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > sizeof(d->hdr)) { + if (offset > sizeof(*hdr)) { __be16 *tagp; /* tag just before us */ - tagp = (__be16 *)((char *)d + offset) - 1; - prevdup = (xfs_dir2_data_unused_t *)((char *)d + be16_to_cpu(*tagp)); + tagp = (__be16 *)((char *)hdr + offset) - 1; + prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG) prevdup = NULL; } else @@ -557,9 +563,9 @@ xfs_dir2_data_make_free( * If this isn't the end of the block, see if the entry after * us is free. */ - if ((char *)d + offset + len < endptr) { + if ((char *)hdr + offset + len < endptr) { postdup = - (xfs_dir2_data_unused_t *)((char *)d + offset + len); + (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG) postdup = NULL; } else @@ -576,21 +582,21 @@ xfs_dir2_data_make_free( /* * See if prevdup and/or postdup are in bestfree table. */ - dfp = xfs_dir2_data_freefind(d, prevdup); - dfp2 = xfs_dir2_data_freefind(d, postdup); + dfp = xfs_dir2_data_freefind(hdr, prevdup); + dfp2 = xfs_dir2_data_freefind(hdr, postdup); /* * We need a rescan unless there are exactly 2 free entries * namely our two. Then we know what's happening, otherwise * since the third bestfree is there, there might be more * entries. */ - needscan = (d->hdr.bestfree[2].length != 0); + needscan = (hdr->bestfree[2].length != 0); /* * Fix up the new big freespace. */ be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(prevdup) = - cpu_to_be16((char *)prevdup - (char *)d); + cpu_to_be16((char *)prevdup - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, prevdup); if (!needscan) { /* @@ -600,18 +606,18 @@ xfs_dir2_data_make_free( * Remove entry 1 first then entry 0. */ ASSERT(dfp && dfp2); - if (dfp == &d->hdr.bestfree[1]) { - dfp = &d->hdr.bestfree[0]; + if (dfp == &hdr->bestfree[1]) { + dfp = &hdr->bestfree[0]; ASSERT(dfp2 == dfp); - dfp2 = &d->hdr.bestfree[1]; + dfp2 = &hdr->bestfree[1]; } - xfs_dir2_data_freeremove(d, dfp2, needlogp); - xfs_dir2_data_freeremove(d, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, dfp2, needlogp); + xfs_dir2_data_freeremove(hdr, dfp, needlogp); /* * Now insert the new entry. */ - dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp); - ASSERT(dfp == &d->hdr.bestfree[0]); + dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); + ASSERT(dfp == &hdr->bestfree[0]); ASSERT(dfp->length == prevdup->length); ASSERT(!dfp[1].length); ASSERT(!dfp[2].length); @@ -621,10 +627,10 @@ xfs_dir2_data_make_free( * The entry before us is free, merge with it. 
*/ else if (prevdup) { - dfp = xfs_dir2_data_freefind(d, prevdup); + dfp = xfs_dir2_data_freefind(hdr, prevdup); be16_add_cpu(&prevdup->length, len); *xfs_dir2_data_unused_tag_p(prevdup) = - cpu_to_be16((char *)prevdup - (char *)d); + cpu_to_be16((char *)prevdup - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, prevdup); /* * If the previous entry was in the table, the new entry @@ -632,27 +638,27 @@ xfs_dir2_data_make_free( * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp); + xfs_dir2_data_freeremove(hdr, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. */ else { needscan = be16_to_cpu(prevdup->length) > - be16_to_cpu(d->hdr.bestfree[2].length); + be16_to_cpu(hdr->bestfree[2].length); } } /* * The following entry is free, merge with it. */ else if (postdup) { - dfp = xfs_dir2_data_freefind(d, postdup); - newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + dfp = xfs_dir2_data_freefind(hdr, postdup); + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(newdup) = - cpu_to_be16((char *)newdup - (char *)d); + cpu_to_be16((char *)newdup - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, newdup); /* * If the following entry was in the table, the new entry @@ -660,28 +666,28 @@ xfs_dir2_data_make_free( * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, newdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. */ else { needscan = be16_to_cpu(newdup->length) > - be16_to_cpu(d->hdr.bestfree[2].length); + be16_to_cpu(hdr->bestfree[2].length); } } /* * Neither neighbor is free. Make a new entry. 
*/ else { - newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len); *xfs_dir2_data_unused_tag_p(newdup) = - cpu_to_be16((char *)newdup - (char *)d); + cpu_to_be16((char *)newdup - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, newdup); - (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + xfs_dir2_data_freeinsert(hdr, newdup, needlogp); } *needscanp = needscan; } @@ -699,7 +705,7 @@ xfs_dir2_data_use_free( int *needlogp, /* out: need to log header */ int *needscanp) /* out: need regen bestfree */ { - xfs_dir2_data_t *d; /* data block */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ int matchback; /* matches end of freespace */ int matchfront; /* matches start of freespace */ @@ -708,24 +714,24 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + hdr = bp->data; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); - ASSERT(offset >= (char *)dup - (char *)d); - ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d); - ASSERT((char *)dup - (char *)d == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + ASSERT(offset >= (char *)dup - (char *)hdr); + ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); + ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); /* * Look up the entry in the bestfree table. */ - dfp = xfs_dir2_data_freefind(d, dup); + dfp = xfs_dir2_data_freefind(hdr, dup); oldlen = be16_to_cpu(dup->length); - ASSERT(dfp || oldlen <= be16_to_cpu(d->hdr.bestfree[2].length)); + ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length)); /* * Check for alignment with front and back of the entry. */ - matchfront = (char *)dup - (char *)d == offset; - matchback = (char *)dup + oldlen - (char *)d == offset + len; + matchfront = (char *)dup - (char *)hdr == offset; + matchback = (char *)dup + oldlen - (char *)hdr == offset + len; ASSERT(*needscanp == 0); needscan = 0; /* @@ -734,9 +740,9 @@ xfs_dir2_data_use_free( */ if (matchfront && matchback) { if (dfp) { - needscan = (d->hdr.bestfree[2].offset != 0); + needscan = (hdr->bestfree[2].offset != 0); if (!needscan) - xfs_dir2_data_freeremove(d, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, dfp, needlogp); } } /* @@ -744,27 +750,27 @@ xfs_dir2_data_use_free( * Make a new entry with the remaining freespace. */ else if (matchfront) { - newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(oldlen - len); *xfs_dir2_data_unused_tag_p(newdup) = - cpu_to_be16((char *)newdup - (char *)d); + cpu_to_be16((char *)newdup - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, newdup); /* * If it was in the table, remove it and add the new one. 
*/ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); /* * If we got inserted at the last slot, * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &d->hdr.bestfree[2]; + needscan = dfp == &hdr->bestfree[2]; } } /* @@ -773,25 +779,25 @@ xfs_dir2_data_use_free( */ else if (matchback) { newdup = dup; - newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); + newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = - cpu_to_be16((char *)newdup - (char *)d); + cpu_to_be16((char *)newdup - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, newdup); /* * If it was in the table, remove it and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); /* * If we got inserted at the last slot, * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &d->hdr.bestfree[2]; + needscan = dfp == &hdr->bestfree[2]; } } /* @@ -800,15 +806,15 @@ xfs_dir2_data_use_free( */ else { newdup = dup; - newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); + newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = - cpu_to_be16((char *)newdup - (char *)d); + cpu_to_be16((char *)newdup - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, newdup); - newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); *xfs_dir2_data_unused_tag_p(newdup2) = - cpu_to_be16((char *)newdup2 - (char *)d); + cpu_to_be16((char *)newdup2 - (char *)hdr); xfs_dir2_data_log_unused(tp, bp, newdup2); /* * If the old entry was in the table, we need to scan @@ -819,13 +825,12 @@ xfs_dir2_data_use_free( * the 2 new will work. */ if (dfp) { - needscan = (d->hdr.bestfree[2].length != 0); + needscan = (hdr->bestfree[2].length != 0); if (!needscan) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - (void)xfs_dir2_data_freeinsert(d, newdup, - needlogp); - (void)xfs_dir2_data_freeinsert(d, newdup2, - needlogp); + xfs_dir2_data_freeremove(hdr, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_freeinsert(hdr, newdup2, + needlogp); } } } diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h deleted file mode 100644 index efbc290c7fec..000000000000 --- a/fs/xfs/xfs_dir2_data.h +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_DATA_H__ -#define __XFS_DIR2_DATA_H__ - -/* - * Directory format 2, data block structures. - */ - -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_inode; -struct xfs_trans; - -/* - * Constants. - */ -#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */ -#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ -#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) -#define XFS_DIR2_DATA_FREE_TAG 0xffff -#define XFS_DIR2_DATA_FD_COUNT 3 - -/* - * Directory address space divided into sections, - * spaces separated by 32GB. - */ -#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) -#define XFS_DIR2_DATA_SPACE 0 -#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_DATA_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) - -/* - * Offsets of . and .. in data space (always block 0) - */ -#define XFS_DIR2_DATA_DOT_OFFSET \ - ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) -#define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) -#define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) - -/* - * Structures. - */ - -/* - * Describe a free area in the data block. - * The freespace will be formatted as a xfs_dir2_data_unused_t. - */ -typedef struct xfs_dir2_data_free { - __be16 offset; /* start of freespace */ - __be16 length; /* length of freespace */ -} xfs_dir2_data_free_t; - -/* - * Header for the data blocks. - * Always at the beginning of a directory-sized block. - * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. - */ -typedef struct xfs_dir2_data_hdr { - __be32 magic; /* XFS_DIR2_DATA_MAGIC */ - /* or XFS_DIR2_BLOCK_MAGIC */ - xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; -} xfs_dir2_data_hdr_t; - -/* - * Active entry in a data block. Aligned to 8 bytes. - * Tag appears as the last 2 bytes. - */ -typedef struct xfs_dir2_data_entry { - __be64 inumber; /* inode number */ - __u8 namelen; /* name length */ - __u8 name[1]; /* name bytes, no null */ - /* variable offset */ - __be16 tag; /* starting offset of us */ -} xfs_dir2_data_entry_t; - -/* - * Unused entry in a data block. Aligned to 8 bytes. - * Tag appears as the last 2 bytes. - */ -typedef struct xfs_dir2_data_unused { - __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ - __be16 length; /* total free length */ - /* variable offset */ - __be16 tag; /* starting offset of us */ -} xfs_dir2_data_unused_t; - -typedef union { - xfs_dir2_data_entry_t entry; - xfs_dir2_data_unused_t unused; -} xfs_dir2_data_union_t; - -/* - * Generic data block structure, for xfs_db. - */ -typedef struct xfs_dir2_data { - xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */ - xfs_dir2_data_union_t u[1]; -} xfs_dir2_data_t; - -/* - * Macros. - */ - -/* - * Size of a data entry. 
- */ -static inline int xfs_dir2_data_entsize(int n) -{ - return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ - (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); -} - -/* - * Pointer to an entry's tag word. - */ -static inline __be16 * -xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep) -{ - return (__be16 *)((char *)dep + - xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); -} - -/* - * Pointer to a freespace's tag word. - */ -static inline __be16 * -xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup) -{ - return (__be16 *)((char *)dup + - be16_to_cpu(dup->length) - sizeof(__be16)); -} - -/* - * Function declarations. - */ -#ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); -#else -#define xfs_dir2_data_check(dp,bp) -#endif -extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d, - xfs_dir2_data_unused_t *dup); -extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d, - xfs_dir2_data_unused_t *dup, int *loghead); -extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d, - int *loghead); -extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, - struct xfs_dabuf **bpp); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_entry_t *dep); -extern void xfs_dir2_data_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_unused_t *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_aoff_t offset, - xfs_dir2_data_aoff_t len, int *needlogp, - int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_unused_t *dup, - xfs_dir2_data_aoff_t offset, - xfs_dir2_data_aoff_t len, int *needlogp, - int *needscanp); - -#endif /* __XFS_DIR2_DATA_H__ */ diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h new file mode 100644 index 000000000000..07270981f48f --- /dev/null +++ b/fs/xfs/xfs_dir2_format.h @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_FORMAT_H__ +#define __XFS_DIR2_FORMAT_H__ + +/* + * Directory version 2. + * + * There are 4 possible formats: + * - shortform - embedded into the inode + * - single block - data with embedded leaf at the end + * - multiple data blocks, single leaf+freeindex block + * - data blocks, node and leaf blocks (btree), freeindex blocks + * + * Note: many node blocks structures and constants are shared with the attr + * code and defined in xfs_da_btree.h. 
+ */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */ +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to fit into the + * literal area of the inode. These "shortform" directories consist of a + * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry + * structures. Due the different inode number storage size and the variable + * length name field in the xfs_dir2_sf_entry all these structure are + * variable length, and the accessors in this file should be used to iterate + * over them. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} __arch_pack xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __u8 namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 name[]; /* name, variable size */ + /* + * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a + * variable offset after the name. + */ +} __arch_pack xfs_dir2_sf_entry_t; + +static inline int xfs_dir2_sf_hdr_size(int i8count) +{ + return sizeof(struct xfs_dir2_sf_hdr) - + (i8count == 0) * + (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) +{ + return get_unaligned_be16(&sfep->offset.i); +} + +static inline void +xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) +{ + put_unaligned_be16(off, &sfep->offset.i); +} + +static inline int +xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len) +{ + return sizeof(struct xfs_dir2_sf_entry) + /* namelen + offset */ + len + /* name */ + (hdr->i8count ? 
/* ino */ + sizeof(xfs_dir2_ino8_t) : + sizeof(xfs_dir2_ino4_t)); +} + +static inline struct xfs_dir2_sf_entry * +xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); +} + +static inline struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); +} + + +/* + * Data block structures. + * + * A pure data block looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + * + * In addition to the pure data blocks for the data and node formats, + * most structures are also used for the combined data/freespace "block" + * format below. + */ + +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32GB. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_DATA_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) + +/* + * Offsets of . and .. in data space (always block 0) + */ +#define XFS_DIR2_DATA_DOT_OFFSET \ + ((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr)) +#define XFS_DIR2_DATA_DOTDOT_OFFSET \ + (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) +#define XFS_DIR2_DATA_FIRST_OFFSET \ + (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) + +/* + * Describe a free area in the data block. + * + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + __be16 offset; /* start of freespace */ + __be16 length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __be32 magic; /* XFS_DIR2_DATA_MAGIC or */ + /* XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * Active entry in a data block. + * + * Aligned to 8 bytes. After the variable length name field there is a + * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p. + */ +typedef struct xfs_dir2_data_entry { + __be64 inumber; /* inode number */ + __u8 namelen; /* name length */ + __u8 name[]; /* name bytes, no null */ + /* __be16 tag; */ /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. + * + * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed + * using xfs_dir2_data_unused_tag_p. 
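Because every shortform entry is variable length, the only safe way to visit
them is through the accessors above.  A minimal sketch of such a walk,
assuming sfp points at a valid in-core header (the function name is
illustrative, not part of this commit):

	static void
	sf_walk_example(struct xfs_dir2_sf_hdr *sfp)
	{
		struct xfs_dir2_sf_entry *sfep = xfs_dir2_sf_firstentry(sfp);
		int	i;

		for (i = 0; i < sfp->count; i++) {
			/*
			 * sfep->name is not null-terminated; sfep->namelen
			 * gives its length, and xfs_dir2_sf_get_offset(sfep)
			 * the offset the entry would have in block form.
			 */
			sfep = xfs_dir2_sf_nextentry(sfp, sfep);
		}
	}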
+ */ +typedef struct xfs_dir2_data_unused { + __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ + __be16 length; /* total free length */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +/* + * Size of a data entry. + */ +static inline int xfs_dir2_data_entsize(int n) +{ + return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n + + (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); +} + +/* + * Pointer to an entry's tag word. + */ +static inline __be16 * +xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * Pointer to a freespace's tag word. + */ +static inline __be16 * +xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) +{ + return (__be16 *)((char *)dup + + be16_to_cpu(dup->length) - sizeof(__be16)); +} + +/* + * Leaf block structures. + * + * A pure leaf block looks like the following drawing on disk: + * + * +---------------------------+ + * | xfs_dir2_leaf_hdr_t | + * +---------------------------+ + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_leaf_tail_t | + * +---------------------------+ + * + * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block + * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present + * for directories with separate leaf nodes and free space blocks + * (magic = XFS_DIR2_LEAFN_MAGIC). + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +/* + * Offset of the leaf/node space. First block in this space + * is the btree root. + */ +#define XFS_DIR2_LEAF_SPACE 1 +#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_LEAF_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) + +/* + * Leaf block header. + */ +typedef struct xfs_dir2_leaf_hdr { + xfs_da_blkinfo_t info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ +} xfs_dir2_leaf_hdr_t; + +/* + * Leaf block entry. + */ +typedef struct xfs_dir2_leaf_entry { + __be32 hashval; /* hash value of name */ + __be32 address; /* address of data entry */ +} xfs_dir2_leaf_entry_t; + +/* + * Leaf block tail. + */ +typedef struct xfs_dir2_leaf_tail { + __be32 bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t ents[]; /* entries */ +} xfs_dir2_leaf_t; + +/* + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) +{ + return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +/* + * Get address of the bestcount field in the single-leaf block. + */ +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + mp->m_dirblksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + +/* + * Get address of the bests array in the single-leaf block. 
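To make xfs_dir2_data_entsize() concrete: the fixed part of an entry is 9
bytes (8-byte inumber plus 1-byte namelen), so a 4-byte name needs
9 + 4 + 2 = 15 bytes including the tag, rounded up to 16 by the 8-byte
alignment.  Below is a sketch of a data-block walk built on these accessors,
assuming a pure data block (XFS_DIR2_DATA_MAGIC) whose entries run to the
end of the block; the single-block format would have to stop at the leaf
tail instead (the helper name is illustrative only):

	static void
	data_walk_example(struct xfs_dir2_data_hdr *hdr, int dirblksize)
	{
		char	*ptr = (char *)(hdr + 1);
		char	*endptr = (char *)hdr + dirblksize;

		while (ptr < endptr) {
			struct xfs_dir2_data_unused	*dup;
			struct xfs_dir2_data_entry	*dep;

			dup = (struct xfs_dir2_data_unused *)ptr;
			if (be16_to_cpu(dup->freetag) ==
			    XFS_DIR2_DATA_FREE_TAG) {
				ptr += be16_to_cpu(dup->length);
				continue;
			}
			dep = (struct xfs_dir2_data_entry *)ptr;
			/* dep->name / dep->namelen describe the entry */
			ptr += xfs_dir2_data_entsize(dep->namelen);
		}
	}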
+ */ +static inline __be16 * +xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) +{ + return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); +} + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t) + (by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & + ((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << + (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); +} + +/* + * Free space block defintions for the node format. + */ + +/* + * Offset of the freespace index. 
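A worked example helps with the shift arithmetic.  For a hypothetical mount
with 4k blocks and single-block directory blocks (sb_blocklog = 12,
sb_dirblklog = 0), dataptr 768 maps to byte offset 768 << 3 = 6144, which is
logical directory block 6144 >> 12 = 1 at in-block offset 6144 & 4095 = 2048,
and converting back yields 768 again.  With the same geometry the leaf space
starts at logical block (1 << 35) >> 12 = 0x800000.  As a sketch:

	static void
	dataptr_math_example(struct xfs_mount *mp)
	{
		/* assumes sb_blocklog == 12 and sb_dirblklog == 0 */
		xfs_dir2_dataptr_t	dp = 768;
		xfs_dir2_off_t		by = xfs_dir2_dataptr_to_byte(mp, dp);
		xfs_dir2_db_t		db = xfs_dir2_byte_to_db(mp, by);
		xfs_dir2_data_aoff_t	off = xfs_dir2_byte_to_off(mp, by);

		ASSERT(by == 6144 && db == 1 && off == 2048);
		ASSERT(xfs_dir2_db_off_to_dataptr(mp, db, off) == dp);
	}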
+ */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_FREE_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) + +typedef struct xfs_dir2_free_hdr { + __be32 magic; /* XFS_DIR2_FREE_MAGIC */ + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; + +static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp) +{ + return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +/* + * Convert data space db to the corresponding free db. + */ +static inline xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static inline int +xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return db % xfs_dir2_free_max_bests(mp); +} + +/* + * Single block format. + * + * The single block format looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t : + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * | ... | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * +-------------------------------------------------+ + * | xfs_dir2_block_tail_t | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. 
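For the same hypothetical 4k geometry, xfs_dir2_free_max_bests() is
(4096 - 16) / 2 = 2040 bests per free block, so data block 5000 is
summarized in free block XFS_DIR2_FREE_FIRSTDB(mp) + 5000 / 2040, i.e. the
third free block, at index 5000 % 2040 = 920 (FIRSTDB itself being
(2 << 35) >> 12 = 0x1000000 here).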
+ */ + +typedef struct xfs_dir2_block_tail { + __be32 count; /* count of leaf entries */ + __be32 stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Pointer to the leaf header embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + mp->m_dirblksize)) - 1; +} + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) +{ + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); +} + +#endif /* __XFS_DIR2_FORMAT_H__ */ diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ae891223be90..ca2386d82cdf 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -24,18 +24,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -64,7 +60,7 @@ xfs_dir2_block_to_leaf( { __be16 *bestsp; /* leaf's bestsp entries */ xfs_dablk_t blkno; /* leaf block's bno */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */ xfs_dir2_block_tail_t *btp; /* block's tail */ xfs_inode_t *dp; /* incore directory inode */ @@ -101,9 +97,9 @@ xfs_dir2_block_to_leaf( } ASSERT(lbp != NULL); leaf = lbp->data; - block = dbp->data; + hdr = dbp->data; xfs_dir2_data_check(dp, dbp); - btp = xfs_dir2_block_tail_p(mp, block); + btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Set the counts in the leaf header. @@ -123,23 +119,23 @@ xfs_dir2_block_to_leaf( * tail be free. */ xfs_dir2_data_make_free(tp, dbp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), - (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize - + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize - (char *)blp), &needlog, &needscan); /* * Fix up the block header, make it a data block. */ - block->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); /* * Set up leaf tail and bests table. */ ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = cpu_to_be32(1); bestsp = xfs_dir2_leaf_bests_p(ltp); - bestsp[0] = block->hdr.bestfree[0].length; + bestsp[0] = hdr->bestfree[0].length; /* * Log the data header and leaf bests table. */ @@ -152,6 +148,131 @@ xfs_dir2_block_to_leaf( return 0; } +STATIC void +xfs_dir2_leaf_find_stale( + struct xfs_dir2_leaf *leaf, + int index, + int *lowstale, + int *highstale) +{ + /* + * Find the first stale entry before our index, if any. + */ + for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) { + if (leaf->ents[*lowstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + break; + } + + /* + * Find the first stale entry at or after our index, if any. 
+ * Stop if the result would require moving more entries than using + * lowstale. + */ + for (*highstale = index; + *highstale < be16_to_cpu(leaf->hdr.count); + ++*highstale) { + if (leaf->ents[*highstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + break; + if (*lowstale >= 0 && index - *lowstale <= *highstale - index) + break; + } +} + +struct xfs_dir2_leaf_entry * +xfs_dir2_leaf_find_entry( + xfs_dir2_leaf_t *leaf, /* leaf structure */ + int index, /* leaf table position */ + int compact, /* need to compact leaves */ + int lowstale, /* index of prev stale leaf */ + int highstale, /* index of next stale leaf */ + int *lfloglow, /* low leaf logging index */ + int *lfloghigh) /* high leaf logging index */ +{ + if (!leaf->hdr.stale) { + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + + /* + * Now we need to make room to insert the leaf entry. + * + * If there are no stale entries, just insert a hole at index. + */ + lep = &leaf->ents[index]; + if (index < be16_to_cpu(leaf->hdr.count)) + memmove(lep + 1, lep, + (be16_to_cpu(leaf->hdr.count) - index) * + sizeof(*lep)); + + /* + * Record low and high logging indices for the leaf. + */ + *lfloglow = index; + *lfloghigh = be16_to_cpu(leaf->hdr.count); + be16_add_cpu(&leaf->hdr.count, 1); + return lep; + } + + /* + * There are stale entries. + * + * We will use one of them for the new entry. It's probably not at + * the right location, so we'll have to shift some up or down first. + * + * If we didn't compact before, we need to find the nearest stale + * entries before and after our insertion point. + */ + if (compact == 0) + xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + + /* + * If the low one is better, use it. + */ + if (lowstale >= 0 && + (highstale == be16_to_cpu(leaf->hdr.count) || + index - lowstale - 1 < highstale - index)) { + ASSERT(index - lowstale - 1 >= 0); + ASSERT(leaf->ents[lowstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + + /* + * Copy entries up to cover the stale entry and make room + * for the new entry. + */ + if (index - lowstale - 1 > 0) { + memmove(&leaf->ents[lowstale], + &leaf->ents[lowstale + 1], + (index - lowstale - 1) * + sizeof(xfs_dir2_leaf_entry_t)); + } + *lfloglow = MIN(lowstale, *lfloglow); + *lfloghigh = MAX(index - 1, *lfloghigh); + be16_add_cpu(&leaf->hdr.stale, -1); + return &leaf->ents[index - 1]; + } + + /* + * The high one is better, so use that one. + */ + ASSERT(highstale - index >= 0); + ASSERT(leaf->ents[highstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + + /* + * Copy entries down to cover the stale entry and make room for the + * new entry. + */ + if (highstale - index > 0) { + memmove(&leaf->ents[index + 1], + &leaf->ents[index], + (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); + } + *lfloglow = MIN(index, *lfloglow); + *lfloghigh = MAX(highstale, *lfloghigh); + be16_add_cpu(&leaf->hdr.stale, -1); + return &leaf->ents[index]; +} + /* * Add an entry to a leaf form directory. 
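The low/high choice above minimizes the memmove.  For example, with
hdr.count = 8, stale slots at ents[1] and ents[7] and an insertion index of
5, reusing lowstale would shift 5 - 1 - 1 = 3 entries up while highstale
shifts only 7 - 5 = 2 entries down, so the high slot wins and the new entry
lands at ents[5].  On a tie (index - lowstale - 1 == highstale - index) the
strict less-than test fails and the high slot is likewise preferred.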
*/ @@ -161,7 +282,7 @@ xfs_dir2_leaf_addname( { __be16 *bestsp; /* freespace table in leaf */ int compact; /* need to compact leaves */ - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dabuf_t *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ @@ -225,7 +346,7 @@ xfs_dir2_leaf_addname( continue; i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); ASSERT(i < be32_to_cpu(ltp->bestcount)); - ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF); + ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF)); if (be16_to_cpu(bestsp[i]) >= length) { use_block = i; break; @@ -239,7 +360,8 @@ xfs_dir2_leaf_addname( /* * Remember a block we see that's missing. */ - if (be16_to_cpu(bestsp[i]) == NULLDATAOFF && use_block == -1) + if (bestsp[i] == cpu_to_be16(NULLDATAOFF) && + use_block == -1) use_block = i; else if (be16_to_cpu(bestsp[i]) >= length) { use_block = i; @@ -250,14 +372,17 @@ xfs_dir2_leaf_addname( /* * How many bytes do we need in the leaf block? */ - needbytes = - (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) + - (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0])); + needbytes = 0; + if (!leaf->hdr.stale) + needbytes += sizeof(xfs_dir2_leaf_entry_t); + if (use_block == -1) + needbytes += sizeof(xfs_dir2_data_off_t); + /* * Now kill use_block if it refers to a missing block, so we * can use it as an indication of allocation needed. */ - if (use_block != -1 && be16_to_cpu(bestsp[use_block]) == NULLDATAOFF) + if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF)) use_block = -1; /* * If we don't have enough free bytes but we can make enough @@ -369,8 +494,8 @@ xfs_dir2_leaf_addname( */ else xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); - data = dbp->data; - bestsp[use_block] = data->hdr.bestfree[0].length; + hdr = dbp->data; + bestsp[use_block] = hdr->bestfree[0].length; grown = 1; } /* @@ -384,7 +509,7 @@ xfs_dir2_leaf_addname( xfs_da_brelse(tp, lbp); return error; } - data = dbp->data; + hdr = dbp->data; grown = 0; } xfs_dir2_data_check(dp, dbp); @@ -392,14 +517,14 @@ xfs_dir2_leaf_addname( * Point to the biggest freespace in our data block. */ dup = (xfs_dir2_data_unused_t *) - ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); ASSERT(be16_to_cpu(dup->length) >= length); needscan = needlog = 0; /* * Mark the initial part of our freespace in use for the new entry. */ xfs_dir2_data_use_free(tp, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* * Initialize our new entry (at last). @@ -409,12 +534,12 @@ xfs_dir2_leaf_addname( dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); tagp = xfs_dir2_data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)data); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Need to scan fix up the bestfree table. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); /* * Need to log the data block's header. */ @@ -425,107 +550,15 @@ xfs_dir2_leaf_addname( * If the bests table needs to be changed, do it. * Log the change unless we've already done that. 
*/ - if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(data->hdr.bestfree[0].length)) { - bestsp[use_block] = data->hdr.bestfree[0].length; + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) { + bestsp[use_block] = hdr->bestfree[0].length; if (!grown) xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); } - /* - * Now we need to make room to insert the leaf entry. - * If there are no stale entries, we just insert a hole at index. - */ - if (!leaf->hdr.stale) { - /* - * lep is still good as the index leaf entry. - */ - if (index < be16_to_cpu(leaf->hdr.count)) - memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep)); - /* - * Record low and high logging indices for the leaf. - */ - lfloglow = index; - lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add_cpu(&leaf->hdr.count, 1); - } - /* - * There are stale entries. - * We will use one of them for the new entry. - * It's probably not at the right location, so we'll have to - * shift some up or down first. - */ - else { - /* - * If we didn't compact before, we need to find the nearest - * stale entries before and after our insertion point. - */ - if (compact == 0) { - /* - * Find the first stale entry before the insertion - * point, if any. - */ - for (lowstale = index - 1; - lowstale >= 0 && - be32_to_cpu(leaf->ents[lowstale].address) != - XFS_DIR2_NULL_DATAPTR; - lowstale--) - continue; - /* - * Find the next stale entry at or after the insertion - * point, if any. Stop if we go so far that the - * lowstale entry would be better. - */ - for (highstale = index; - highstale < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(leaf->ents[highstale].address) != - XFS_DIR2_NULL_DATAPTR && - (lowstale < 0 || - index - lowstale - 1 >= highstale - index); - highstale++) - continue; - } - /* - * If the low one is better, use it. - */ - if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || - index - lowstale - 1 < highstale - index)) { - ASSERT(index - lowstale - 1 >= 0); - ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == - XFS_DIR2_NULL_DATAPTR); - /* - * Copy entries up to cover the stale entry - * and make room for the new entry. - */ - if (index - lowstale - 1 > 0) - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], - (index - lowstale - 1) * sizeof(*lep)); - lep = &leaf->ents[index - 1]; - lfloglow = MIN(lowstale, lfloglow); - lfloghigh = MAX(index - 1, lfloghigh); - } - /* - * The high one is better, so use that one. - */ - else { - ASSERT(highstale - index >= 0); - ASSERT(be32_to_cpu(leaf->ents[highstale].address) == - XFS_DIR2_NULL_DATAPTR); - /* - * Copy entries down to cover the stale entry - * and make room for the new entry. - */ - if (highstale - index > 0) - memmove(&leaf->ents[index + 1], - &leaf->ents[index], - (highstale - index) * sizeof(*lep)); - lep = &leaf->ents[index]; - lfloglow = MIN(index, lfloglow); - lfloghigh = MAX(highstale, lfloghigh); - } - be16_add_cpu(&leaf->hdr.stale, -1); - } + + lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + /* * Fill in the new leaf entry. */ @@ -562,7 +595,7 @@ xfs_dir2_leaf_check( leaf = bp->data; mp = dp->i_mount; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); /* * This value is not restrictive enough. * Should factor in the size of the bests table as well. 
@@ -582,7 +615,7 @@ xfs_dir2_leaf_check( if (i + 1 < be16_to_cpu(leaf->hdr.count)) ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= be32_to_cpu(leaf->ents[i + 1].hashval)); - if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); @@ -611,7 +644,8 @@ xfs_dir2_leaf_compact( * Compress out the stale entries in place. */ for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { - if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + if (leaf->ents[from].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. @@ -663,24 +697,9 @@ xfs_dir2_leaf_compact_x1( leaf = bp->data; ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); index = *indexp; - /* - * Find the first stale entry before our index, if any. - */ - for (lowstale = index - 1; - lowstale >= 0 && - be32_to_cpu(leaf->ents[lowstale].address) != XFS_DIR2_NULL_DATAPTR; - lowstale--) - continue; - /* - * Find the first stale entry at or after our index, if any. - * Stop if the answer would be worse than lowstale. - */ - for (highstale = index; - highstale < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(leaf->ents[highstale].address) != XFS_DIR2_NULL_DATAPTR && - (lowstale < 0 || index - lowstale > highstale - index); - highstale++) - continue; + + xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + /* * Pick the better of lowstale and highstale. */ @@ -701,7 +720,8 @@ xfs_dir2_leaf_compact_x1( if (index == from) newindex = to; if (from != keepstale && - be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) { + leaf->ents[from].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (from == to) *lowlogp = to; continue; @@ -760,7 +780,7 @@ xfs_dir2_leaf_getdents( int byteoff; /* offset in current block */ xfs_dir2_db_t curdb; /* db for current block */ xfs_dir2_off_t curoff; /* current overall offset */ - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ int error = 0; /* error return value */ @@ -1018,23 +1038,23 @@ xfs_dir2_leaf_getdents( else if (curoff > newoff) ASSERT(xfs_dir2_byte_to_db(mp, curoff) == curdb); - data = bp->data; + hdr = bp->data; xfs_dir2_data_check(dp, bp); /* * Find our position in the block. */ - ptr = (char *)&data->u; + ptr = (char *)(hdr + 1); byteoff = xfs_dir2_byte_to_off(mp, curoff); /* * Skip past the header. */ if (byteoff == 0) - curoff += (uint)sizeof(data->hdr); + curoff += (uint)sizeof(*hdr); /* * Skip past entries until we reach our offset. 
*/ else { - while ((char *)ptr - (char *)data < byteoff) { + while ((char *)ptr - (char *)hdr < byteoff) { dup = (xfs_dir2_data_unused_t *)ptr; if (be16_to_cpu(dup->freetag) @@ -1055,8 +1075,8 @@ xfs_dir2_leaf_getdents( curoff = xfs_dir2_db_off_to_byte(mp, xfs_dir2_byte_to_db(mp, curoff), - (char *)ptr - (char *)data); - if (ptr >= (char *)data + mp->m_dirblksize) { + (char *)ptr - (char *)hdr); + if (ptr >= (char *)hdr + mp->m_dirblksize) { continue; } } @@ -1179,7 +1199,7 @@ xfs_dir2_leaf_log_bests( xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; @@ -1202,8 +1222,8 @@ xfs_dir2_leaf_log_ents( xfs_dir2_leaf_t *leaf; /* leaf structure */ leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || - be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); firstlep = &leaf->ents[first]; lastlep = &leaf->ents[last]; xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), @@ -1221,8 +1241,8 @@ xfs_dir2_leaf_log_header( xfs_dir2_leaf_t *leaf; /* leaf structure */ leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || - be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), (uint)(sizeof(leaf->hdr) - 1)); } @@ -1241,7 +1261,7 @@ xfs_dir2_leaf_log_tail( mp = tp->t_mountp; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); @@ -1437,7 +1457,7 @@ xfs_dir2_leaf_removename( xfs_da_args_t *args) /* operation arguments */ { __be16 *bestsp; /* leaf block best freespace */ - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ xfs_dabuf_t *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry structure */ @@ -1467,7 +1487,7 @@ xfs_dir2_leaf_removename( tp = args->trans; mp = dp->i_mount; leaf = lbp->data; - data = dbp->data; + hdr = dbp->data; xfs_dir2_data_check(dp, dbp); /* * Point to the leaf entry, use that to point to the data entry. @@ -1475,9 +1495,9 @@ xfs_dir2_leaf_removename( lep = &leaf->ents[index]; db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *) - ((char *)data + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); needscan = needlog = 0; - oldbest = be16_to_cpu(data->hdr.bestfree[0].length); + oldbest = be16_to_cpu(hdr->bestfree[0].length); ltp = xfs_dir2_leaf_tail_p(mp, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); @@ -1485,7 +1505,7 @@ xfs_dir2_leaf_removename( * Mark the former data entry unused. 
*/ xfs_dir2_data_make_free(tp, dbp, - (xfs_dir2_data_aoff_t)((char *)dep - (char *)data), + (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); /* * We just mark the leaf entry stale by putting a null in it. @@ -1499,23 +1519,23 @@ xfs_dir2_leaf_removename( * log the data block header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); /* * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. */ - if (be16_to_cpu(data->hdr.bestfree[0].length) != oldbest) { - bestsp[db] = data->hdr.bestfree[0].length; + if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) { + bestsp[db] = hdr->bestfree[0].length; xfs_dir2_leaf_log_bests(tp, lbp, db, db); } xfs_dir2_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. */ - if (be16_to_cpu(data->hdr.bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(data->hdr)) { + if (be16_to_cpu(hdr->bestfree[0].length) == + mp->m_dirblksize - (uint)sizeof(*hdr)) { ASSERT(db != mp->m_dirdatablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* @@ -1542,7 +1562,7 @@ xfs_dir2_leaf_removename( * Look for the last active entry (i). */ for (i = db - 1; i > 0; i--) { - if (be16_to_cpu(bestsp[i]) != NULLDATAOFF) + if (bestsp[i] != cpu_to_be16(NULLDATAOFF)) break; } /* @@ -1686,9 +1706,6 @@ xfs_dir2_leaf_trim_data( xfs_dir2_db_t db) /* data block number */ { __be16 *bestsp; /* leaf bests table */ -#ifdef DEBUG - xfs_dir2_data_t *data; /* data block structure */ -#endif xfs_dabuf_t *dbp; /* data block buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ @@ -1707,20 +1724,21 @@ xfs_dir2_leaf_trim_data( XFS_DATA_FORK))) { return error; } -#ifdef DEBUG - data = dbp->data; - ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); -#endif - /* this seems to be an error - * data is only valid if DEBUG is defined? - * RMC 09/08/1999 - */ leaf = lbp->data; ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(data->hdr)); + +#ifdef DEBUG +{ + struct xfs_dir2_data_hdr *hdr = dbp->data; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(be16_to_cpu(hdr->bestfree[0].length) == + mp->m_dirblksize - (uint)sizeof(*hdr)); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); +} +#endif + /* * Get rid of the data block. */ @@ -1740,6 +1758,20 @@ xfs_dir2_leaf_trim_data( return 0; } +static inline size_t +xfs_dir2_leaf_size( + struct xfs_dir2_leaf_hdr *hdr, + int counts) +{ + int entries; + + entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale); + return sizeof(xfs_dir2_leaf_hdr_t) + + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); +} + /* * Convert node form directory to leaf form directory. * The root of the node form dir needs to already be a LEAFN block. @@ -1810,7 +1842,7 @@ xfs_dir2_node_to_leaf( return 0; lbp = state->path.blk[0].bp; leaf = lbp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Read the freespace block. 
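As a worked example of the new xfs_dir2_leaf_size() helper, again assuming
4k directory blocks and the usual 16-byte leaf header: a leafn block with
count = 410 and stale = 10 plus a free block with nvalid = 100 needs
16 + 400 * 8 + 100 * 2 + 4 = 3420 bytes, which fits in 4096, so the
conversion below proceeds; at 500 live entries it would need 4220 bytes
and xfs_dir2_node_to_leaf() would give up and keep the node form.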
*/ @@ -1819,20 +1851,19 @@ xfs_dir2_node_to_leaf( return error; } free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); + /* * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if ((uint)sizeof(leaf->hdr) + - (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)) * (uint)sizeof(leaf->ents[0]) + - be32_to_cpu(free->hdr.nvalid) * (uint)sizeof(leaf->bests[0]) + - (uint)sizeof(leaf->tail) > - mp->m_dirblksize) { + if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > + mp->m_dirblksize) { xfs_da_brelse(tp, fbp); return 0; } + /* * If the leaf has any stale entries in it, compress them out. * The compact routine will log the header. @@ -1851,7 +1882,7 @@ xfs_dir2_node_to_leaf( * Set up the leaf bests table. */ memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, - be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0])); + be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t)); xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); xfs_dir2_leaf_log_tail(tp, lbp); xfs_dir2_leaf_check(dp, lbp); diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h deleted file mode 100644 index 6c9539f06987..000000000000 --- a/fs/xfs/xfs_dir2_leaf.h +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_LEAF_H__ -#define __XFS_DIR2_LEAF_H__ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_inode; -struct xfs_mount; -struct xfs_trans; - -/* - * Offset of the leaf/node space. First block in this space - * is the btree root. - */ -#define XFS_DIR2_LEAF_SPACE 1 -#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_LEAF_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) - -/* - * Offset in data space of a data entry. - */ -typedef __uint32_t xfs_dir2_dataptr_t; -#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) -#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) - -/* - * Leaf block header. - */ -typedef struct xfs_dir2_leaf_hdr { - xfs_da_blkinfo_t info; /* header for da routines */ - __be16 count; /* count of entries */ - __be16 stale; /* count of stale entries */ -} xfs_dir2_leaf_hdr_t; - -/* - * Leaf block entry. - */ -typedef struct xfs_dir2_leaf_entry { - __be32 hashval; /* hash value of name */ - __be32 address; /* address of data entry */ -} xfs_dir2_leaf_entry_t; - -/* - * Leaf block tail. - */ -typedef struct xfs_dir2_leaf_tail { - __be32 bestcount; -} xfs_dir2_leaf_tail_t; - -/* - * Leaf block. - * bests and tail are at the end of the block for single-leaf only - * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC). 
- */ -typedef struct xfs_dir2_leaf { - xfs_dir2_leaf_hdr_t hdr; /* leaf header */ - xfs_dir2_leaf_entry_t ents[1]; /* entries */ - /* ... */ - xfs_dir2_data_off_t bests[1]; /* best free counts */ - xfs_dir2_leaf_tail_t tail; /* leaf tail */ -} xfs_dir2_leaf_t; - -/* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ - -static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) -{ - return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / - (uint)sizeof(xfs_dir2_leaf_entry_t)); -} - -/* - * Get address of the bestcount field in the single-leaf block. - */ -static inline xfs_dir2_leaf_tail_t * -xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) -{ - return (xfs_dir2_leaf_tail_t *) - ((char *)(lp) + (mp)->m_dirblksize - - (uint)sizeof(xfs_dir2_leaf_tail_t)); -} - -/* - * Get address of the bests array in the single-leaf block. - */ -static inline __be16 * -xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) -{ - return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); -} - -/* - * Convert dataptr to byte in file space - */ -static inline xfs_dir2_off_t -xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return (xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG; -} - -/* - * Convert byte in file space to dataptr. It had better be aligned. - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG); -} - -/* - * Convert byte in space to (DB) block - */ -static inline xfs_dir2_db_t -xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_db_t)((by) >> \ - ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)); -} - -/* - * Convert dataptr to a block number - */ -static inline xfs_dir2_db_t -xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert byte in space to offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_data_aoff_t)((by) & \ - ((1 << ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) - 1)); -} - -/* - * Convert dataptr to a byte offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert block and offset to byte in space - */ -static inline xfs_dir2_off_t -xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return ((xfs_dir2_off_t)(db) << \ - ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o); -} - -/* - * Convert block (DB) to block (dablk) - */ -static inline xfs_dablk_t -xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return (xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog); -} - -/* - * Convert byte in space to (DA) block - */ -static inline xfs_dablk_t -xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); -} - -/* - * Convert block and offset to dataptr - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); -} - -/* - * Convert block (dablk) to block (DB) - */ -static inline xfs_dir2_db_t -xfs_dir2_da_to_db(struct xfs_mount *mp, 
xfs_dablk_t da) -{ - return (xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog); -} - -/* - * Convert block (dablk) to byte offset in space - */ -static inline xfs_dir2_off_t -xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) -{ - return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); -} - -/* - * Function declarations. - */ -extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, - struct xfs_dabuf *dbp); -extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_dabuf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, - int *lowstalep, int *highstalep, - int *lowlogp, int *highlogp); -extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, - size_t bufsize, xfs_off_t *offset, - filldir_t filldir); -extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_dabuf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, - int first, int last); -extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); -extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); -extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); -extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); -extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, - struct xfs_dabuf *lbp); -extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, - struct xfs_dabuf *lbp, xfs_dir2_db_t db); -extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); - -#endif /* __XFS_DIR2_LEAF_H__ */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index a0aab7d3294f..084b3247d636 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -23,18 +23,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -73,7 +69,7 @@ xfs_dir2_free_log_bests( xfs_dir2_free_t *free; /* freespace structure */ free = bp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); xfs_da_log_buf(tp, bp, (uint)((char *)&free->bests[first] - (char *)free), (uint)((char *)&free->bests[last] - (char *)free + @@ -91,7 +87,7 @@ xfs_dir2_free_log_header( xfs_dir2_free_t *free; /* freespace structure */ free = bp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); } @@ -244,89 +240,13 @@ xfs_dir2_leafn_add( lfloglow = be16_to_cpu(leaf->hdr.count); lfloghigh = -1; } - /* - * No stale entries, just insert a space for the new entry. - */ - if (!leaf->hdr.stale) { - lep = &leaf->ents[index]; - if (index < be16_to_cpu(leaf->hdr.count)) - memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep)); - lfloglow = index; - lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add_cpu(&leaf->hdr.count, 1); - } - /* - * There are stale entries. We'll use one for the new entry. 
- */ - else { - /* - * If we didn't do a compact then we need to figure out - * which stale entry will be used. - */ - if (compact == 0) { - /* - * Find first stale entry before our insertion point. - */ - for (lowstale = index - 1; - lowstale >= 0 && - be32_to_cpu(leaf->ents[lowstale].address) != - XFS_DIR2_NULL_DATAPTR; - lowstale--) - continue; - /* - * Find next stale entry after insertion point. - * Stop looking if the answer would be worse than - * lowstale already found. - */ - for (highstale = index; - highstale < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(leaf->ents[highstale].address) != - XFS_DIR2_NULL_DATAPTR && - (lowstale < 0 || - index - lowstale - 1 >= highstale - index); - highstale++) - continue; - } - /* - * Using the low stale entry. - * Shift entries up toward the stale slot. - */ - if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || - index - lowstale - 1 < highstale - index)) { - ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == - XFS_DIR2_NULL_DATAPTR); - ASSERT(index - lowstale - 1 >= 0); - if (index - lowstale - 1 > 0) - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], - (index - lowstale - 1) * sizeof(*lep)); - lep = &leaf->ents[index - 1]; - lfloglow = MIN(lowstale, lfloglow); - lfloghigh = MAX(index - 1, lfloghigh); - } - /* - * Using the high stale entry. - * Shift entries down toward the stale slot. - */ - else { - ASSERT(be32_to_cpu(leaf->ents[highstale].address) == - XFS_DIR2_NULL_DATAPTR); - ASSERT(highstale - index >= 0); - if (highstale - index > 0) - memmove(&leaf->ents[index + 1], - &leaf->ents[index], - (highstale - index) * sizeof(*lep)); - lep = &leaf->ents[index]; - lfloglow = MIN(index, lfloglow); - lfloghigh = MAX(highstale, lfloghigh); - } - be16_add_cpu(&leaf->hdr.stale, -1); - } + /* * Insert the new entry, log everything. 
*/ + lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + lep->hashval = cpu_to_be32(args->hashval); lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, args->blkno, args->index)); @@ -352,14 +272,14 @@ xfs_dir2_leafn_check( leaf = bp->data; mp = dp->i_mount; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { if (i + 1 < be16_to_cpu(leaf->hdr.count)) { ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= be32_to_cpu(leaf->ents[i + 1].hashval)); } - if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); @@ -378,7 +298,7 @@ xfs_dir2_leafn_lasthash( xfs_dir2_leaf_t *leaf; /* leaf structure */ leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); if (!leaf->hdr.count) @@ -417,7 +337,7 @@ xfs_dir2_leafn_lookup_for_addname( tp = args->trans; mp = dp->i_mount; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); #endif @@ -434,7 +354,7 @@ xfs_dir2_leafn_lookup_for_addname( curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; free = curbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } length = xfs_dir2_data_entsize(args->namelen); /* @@ -488,7 +408,7 @@ xfs_dir2_leafn_lookup_for_addname( ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); ASSERT((be32_to_cpu(free->hdr.firstdb) % - XFS_DIR2_MAX_FREE_BESTS(mp)) == 0); + xfs_dir2_free_max_bests(mp)) == 0); ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + be32_to_cpu(free->hdr.nvalid)); @@ -500,7 +420,8 @@ xfs_dir2_leafn_lookup_for_addname( /* * If it has room, return it. 
*/ - if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { + if (unlikely(free->bests[fi] == + cpu_to_be16(NULLDATAOFF))) { XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) @@ -561,7 +482,7 @@ xfs_dir2_leafn_lookup_for_entry( tp = args->trans; mp = dp->i_mount; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); #endif @@ -742,7 +663,8 @@ xfs_dir2_leafn_moveents( int i; /* temp leaf index */ for (i = start_s, stale = 0; i < start_s + count; i++) { - if (be32_to_cpu(leaf_s->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + if (leaf_s->ents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } } else @@ -789,8 +711,8 @@ xfs_dir2_leafn_order( leaf1 = leaf1_bp->data; leaf2 = leaf2_bp->data; - ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (be16_to_cpu(leaf1->hdr.count) > 0 && be16_to_cpu(leaf2->hdr.count) > 0 && (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || @@ -918,7 +840,7 @@ xfs_dir2_leafn_remove( xfs_da_state_blk_t *dblk, /* data block */ int *rval) /* resulting block needs join */ { - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ xfs_dabuf_t *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ @@ -938,7 +860,7 @@ xfs_dir2_leafn_remove( tp = args->trans; mp = dp->i_mount; leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Point to the entry we're removing. */ @@ -963,9 +885,9 @@ xfs_dir2_leafn_remove( * in the data block in case it changes. */ dbp = dblk->bp; - data = dbp->data; - dep = (xfs_dir2_data_entry_t *)((char *)data + off); - longest = be16_to_cpu(data->hdr.bestfree[0].length); + hdr = dbp->data; + dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); + longest = be16_to_cpu(hdr->bestfree[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(tp, dbp, off, xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); @@ -974,7 +896,7 @@ xfs_dir2_leafn_remove( * Log the data block header if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); xfs_dir2_data_check(dp, dbp); @@ -982,7 +904,7 @@ xfs_dir2_leafn_remove( * If the longest data block freespace changes, need to update * the corresponding freeblock entry. */ - if (longest < be16_to_cpu(data->hdr.bestfree[0].length)) { + if (longest < be16_to_cpu(hdr->bestfree[0].length)) { int error; /* error return value */ xfs_dabuf_t *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ @@ -1000,27 +922,27 @@ xfs_dir2_leafn_remove( return error; } free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == - XFS_DIR2_MAX_FREE_BESTS(mp) * + xfs_dir2_free_max_bests(mp) * (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); /* * Calculate which entry we need to fix. 
*/ findex = xfs_dir2_db_to_fdindex(mp, db); - longest = be16_to_cpu(data->hdr.bestfree[0].length); + longest = be16_to_cpu(hdr->bestfree[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) { + if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) { /* * Try to punch out the data block. */ error = xfs_dir2_shrink_inode(args, db, dbp); if (error == 0) { dblk->bp = NULL; - data = NULL; + hdr = NULL; } /* * We can get ENOSPC if there's no space reservation. @@ -1036,7 +958,7 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - if (data == NULL) { + if (hdr == NULL) { /* * One less used entry in the free table. */ @@ -1052,7 +974,8 @@ xfs_dir2_leafn_remove( int i; /* free entry index */ for (i = findex - 1; - i >= 0 && be16_to_cpu(free->bests[i]) == NULLDATAOFF; + i >= 0 && + free->bests[i] == cpu_to_be16(NULLDATAOFF); i--) continue; free->hdr.nvalid = cpu_to_be32(i + 1); @@ -1209,7 +1132,7 @@ xfs_dir2_leafn_toosmall( */ blk = &state->path.blk[state->path.active - 1]; info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); leaf = (xfs_dir2_leaf_t *)info; count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); @@ -1268,7 +1191,7 @@ xfs_dir2_leafn_toosmall( count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes = state->blocksize - (state->blocksize >> 2); leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes -= count * (uint)sizeof(leaf->ents[0]); /* @@ -1327,8 +1250,8 @@ xfs_dir2_leafn_unbalance( ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); drop_leaf = drop_blk->bp->data; save_leaf = save_blk->bp->data; - ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * If there are any stale leaf entries, take this opportunity * to purge them. 
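
[Editor's note on the refactor a few hunks back: the open-coded stale-slot selection deleted from xfs_dir2_leafn_add() above (and its twin in xfs_dir2_leaf_addname()) is replaced by a single call to xfs_dir2_leaf_find_entry(). The patch shows only the call site and the declaration added to xfs_dir2_priv.h, so the following is a sketch of what the consolidated helper has to do, reconstructed from the deleted open-coded version. It is kernel-context pseudocode relying on the usual xfs_dir2 types and the MIN/MAX macros, illustrative rather than a copy of the final fs/xfs/xfs_dir2_leaf.c implementation.]

struct xfs_dir2_leaf_entry *
xfs_dir2_leaf_find_entry(
	xfs_dir2_leaf_t		*leaf,
	int			index,
	int			compact,
	int			lowstale,
	int			highstale,
	int			*lfloglow,
	int			*lfloghigh)
{
	if (!leaf->hdr.stale) {
		xfs_dir2_leaf_entry_t	*lep;

		/*
		 * No stale entries: open a hole at index by shifting
		 * the tail of the entry array up one slot.
		 */
		lep = &leaf->ents[index];
		if (index < be16_to_cpu(leaf->hdr.count))
			memmove(lep + 1, lep,
				(be16_to_cpu(leaf->hdr.count) - index) *
				sizeof(*lep));
		*lfloglow = index;
		*lfloghigh = be16_to_cpu(leaf->hdr.count);
		be16_add_cpu(&leaf->hdr.count, 1);
		return lep;
	}

	/*
	 * There are stale entries.  If the caller did not compact the
	 * leaf first, find the nearest stale slot on either side of
	 * the insertion point (a prior compact already left them in
	 * lowstale/highstale).
	 */
	if (compact == 0) {
		for (lowstale = index - 1;
		     lowstale >= 0 &&
			leaf->ents[lowstale].address !=
			cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
		     lowstale--)
			continue;
		for (highstale = index;
		     highstale < be16_to_cpu(leaf->hdr.count) &&
			leaf->ents[highstale].address !=
			cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
			(lowstale < 0 ||
			 index - lowstale - 1 >= highstale - index);
		     highstale++)
			continue;
	}

	/*
	 * Shift entries toward whichever stale slot is closer, so the
	 * memmove is as small as possible, and hand back the freed slot.
	 */
	if (lowstale >= 0 &&
	    (highstale == be16_to_cpu(leaf->hdr.count) ||
	     index - lowstale - 1 < highstale - index)) {
		if (index - lowstale - 1 > 0)
			memmove(&leaf->ents[lowstale],
				&leaf->ents[lowstale + 1],
				(index - lowstale - 1) *
				sizeof(leaf->ents[0]));
		*lfloglow = MIN(lowstale, *lfloglow);
		*lfloghigh = MAX(index - 1, *lfloghigh);
		be16_add_cpu(&leaf->hdr.stale, -1);
		return &leaf->ents[index - 1];
	}
	if (highstale - index > 0)
		memmove(&leaf->ents[index + 1], &leaf->ents[index],
			(highstale - index) * sizeof(leaf->ents[0]));
	*lfloglow = MIN(index, *lfloglow);
	*lfloghigh = MAX(highstale, *lfloghigh);
	be16_add_cpu(&leaf->hdr.stale, -1);
	return &leaf->ents[index];
}

Folding both copies into one helper keeps the leaf- and node-format insert paths from drifting apart, which is the point of this hunk.
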
@@ -1432,7 +1355,7 @@ xfs_dir2_node_addname_int( xfs_da_args_t *args, /* operation arguments */ xfs_da_state_blk_t *fblk) /* optional freespace block */ { - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t dbno; /* data block number */ xfs_dabuf_t *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ @@ -1469,7 +1392,7 @@ xfs_dir2_node_addname_int( */ ifbno = fblk->blkno; free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; /* * This means the free entry showed that the data block had @@ -1553,7 +1476,7 @@ xfs_dir2_node_addname_int( continue; } free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } /* @@ -1680,12 +1603,12 @@ xfs_dir2_node_addname_int( free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); free->hdr.firstdb = cpu_to_be32( (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - XFS_DIR2_MAX_FREE_BESTS(mp)); + xfs_dir2_free_max_bests(mp)); free->hdr.nvalid = 0; free->hdr.nused = 0; } else { free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } /* @@ -1697,7 +1620,7 @@ xfs_dir2_node_addname_int( * freespace block, extend that table. */ if (findex >= be32_to_cpu(free->hdr.nvalid)) { - ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp)); + ASSERT(findex < xfs_dir2_free_max_bests(mp)); free->hdr.nvalid = cpu_to_be32(findex + 1); /* * Tag new entry so nused will go up. @@ -1708,7 +1631,7 @@ xfs_dir2_node_addname_int( * If this entry was for an empty data block * (this should always be true) then update the header. */ - if (be16_to_cpu(free->bests[findex]) == NULLDATAOFF) { + if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) { be32_add_cpu(&free->hdr.nused, 1); xfs_dir2_free_log_header(tp, fbp); } @@ -1717,8 +1640,8 @@ xfs_dir2_node_addname_int( * We haven't allocated the data entry yet so this will * change again. */ - data = dbp->data; - free->bests[findex] = data->hdr.bestfree[0].length; + hdr = dbp->data; + free->bests[findex] = hdr->bestfree[0].length; logfree = 1; } /* @@ -1743,21 +1666,21 @@ xfs_dir2_node_addname_int( xfs_da_buf_done(fbp); return error; } - data = dbp->data; + hdr = dbp->data; logfree = 0; } - ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) >= length); + ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); /* * Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) - ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); needscan = needlog = 0; /* * Mark the first part of the unused space, inuse for us. */ xfs_dir2_data_use_free(tp, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* * Fill in the new entry and log it. @@ -1767,13 +1690,13 @@ xfs_dir2_node_addname_int( dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); tagp = xfs_dir2_data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)data); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, dbp, dep); /* * Rescan the block for bestfree if needed. 
*/ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog); + xfs_dir2_data_freescan(mp, hdr, &needlog); /* * Log the data block header if needed. */ @@ -1782,8 +1705,8 @@ xfs_dir2_node_addname_int( /* * If the freespace entry is now wrong, update it. */ - if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(data->hdr.bestfree[0].length)) { - free->bests[findex] = data->hdr.bestfree[0].length; + if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) { + free->bests[findex] = hdr->bestfree[0].length; logfree = 1; } /* @@ -1933,7 +1856,7 @@ xfs_dir2_node_replace( xfs_da_args_t *args) /* operation arguments */ { xfs_da_state_blk_t *blk; /* leaf block */ - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry changed */ int error; /* error return value */ int i; /* btree level */ @@ -1977,10 +1900,10 @@ xfs_dir2_node_replace( /* * Point to the data entry. */ - data = state->extrablk.bp->data; - ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); + hdr = state->extrablk.bp->data; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) - ((char *)data + + ((char *)hdr + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* @@ -2044,7 +1967,7 @@ xfs_dir2_node_trim_free( return 0; } free = bp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* * If there are used entries, there's nothing to do. */ diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h deleted file mode 100644 index 82dfe7147195..000000000000 --- a/fs/xfs/xfs_dir2_node.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_NODE_H__ -#define __XFS_DIR2_NODE_H__ - -/* - * Directory version 2, btree node format structures - */ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_da_state; -struct xfs_da_state_blk; -struct xfs_inode; -struct xfs_trans; - -/* - * Offset of the freespace index. 
- */ -#define XFS_DIR2_FREE_SPACE 2 -#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_FREE_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) - -#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ - -typedef struct xfs_dir2_free_hdr { - __be32 magic; /* XFS_DIR2_FREE_MAGIC */ - __be32 firstdb; /* db of first entry */ - __be32 nvalid; /* count of valid entries */ - __be32 nused; /* count of used entries */ -} xfs_dir2_free_hdr_t; - -typedef struct xfs_dir2_free { - xfs_dir2_free_hdr_t hdr; /* block header */ - __be16 bests[1]; /* best free counts */ - /* unused entries are -1 */ -} xfs_dir2_free_t; - -#define XFS_DIR2_MAX_FREE_BESTS(mp) \ - (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \ - (uint)sizeof(xfs_dir2_data_off_t)) - -/* - * Convert data space db to the corresponding free db. - */ -static inline xfs_dir2_db_t -xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp)); -} - -/* - * Convert data space db to the corresponding index in a free db. - */ -static inline int -xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); -} - -extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, - struct xfs_dabuf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); -extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, - struct xfs_da_args *args, int *indexp, - struct xfs_da_state *state); -extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); -extern int xfs_dir2_leafn_split(struct xfs_da_state *state, - struct xfs_da_state_blk *oldblk, - struct xfs_da_state_blk *newblk); -extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); -extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, - struct xfs_da_state_blk *drop_blk, - struct xfs_da_state_blk *save_blk); -extern int xfs_dir2_node_addname(struct xfs_da_args *args); -extern int xfs_dir2_node_lookup(struct xfs_da_args *args); -extern int xfs_dir2_node_removename(struct xfs_da_args *args); -extern int xfs_dir2_node_replace(struct xfs_da_args *args); -extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, - int *rvalp); - -#endif /* __XFS_DIR2_NODE_H__ */ diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h new file mode 100644 index 000000000000..067f403ecf8a --- /dev/null +++ b/fs/xfs/xfs_dir2_priv.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_PRIV_H__ +#define __XFS_DIR2_PRIV_H__ + +/* xfs_dir2.c */ +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r); +extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); +extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); +extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, + struct xfs_dabuf *bp); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); + +/* xfs_dir2_block.c */ +extern int xfs_dir2_block_addname(struct xfs_da_args *args); +extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_block_lookup(struct xfs_da_args *args); +extern int xfs_dir2_block_removename(struct xfs_da_args *args); +extern int xfs_dir2_block_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, + struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); + +/* xfs_dir2_data.c */ +#ifdef DEBUG +extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); +#else +#define xfs_dir2_data_check(dp,bp) +#endif +extern struct xfs_dir2_data_free * +xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_unused *dup, int *loghead); +extern void xfs_dir2_data_freescan(struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr, int *loghead); +extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_dabuf **bpp); +extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_dir2_data_entry *dep); +extern void xfs_dir2_data_log_header(struct xfs_trans *tp, + struct xfs_dabuf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_dir2_data_unused *dup); +extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, + int *needlogp, int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); + +/* xfs_dir2_leaf.c */ +extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, + struct xfs_dabuf *dbp); +extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); +extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, + struct xfs_dabuf *bp); +extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, + int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); +extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, + size_t bufsize, xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_dabuf **bpp, int magic); +extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); +extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, + struct xfs_dabuf *bp); +extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); +extern int xfs_dir2_leaf_replace(struct xfs_da_args 
*args); +extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, + struct xfs_dabuf *lbp); +extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, + struct xfs_dabuf *lbp, xfs_dir2_db_t db); +extern struct xfs_dir2_leaf_entry * +xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, + int lowstale, int highstale, + int *lfloglow, int *lfloghigh); +extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); + +/* xfs_dir2_node.c */ +extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, + struct xfs_dabuf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, + struct xfs_da_args *args, int *indexp, + struct xfs_da_state *state); +extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); +extern int xfs_dir2_leafn_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); +extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); +extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +extern int xfs_dir2_node_addname(struct xfs_da_args *args); +extern int xfs_dir2_node_lookup(struct xfs_da_args *args); +extern int xfs_dir2_node_removename(struct xfs_da_args *args); +extern int xfs_dir2_node_replace(struct xfs_da_args *args); +extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, + int *rvalp); + +/* xfs_dir2_sf.c */ +extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); +extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp, + struct xfs_dir2_sf_entry *sfep); +extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, + struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, + int size, xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_sf_addname(struct xfs_da_args *args); +extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); +extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_sf_removename(struct xfs_da_args *args); +extern int xfs_dir2_sf_replace(struct xfs_da_args *args); + +#endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index b1bae6b1eed9..79d05e84e296 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -23,18 +23,16 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_error.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" +#include "xfs_dir2.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_trace.h" /* @@ -59,6 +57,82 @@ static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); #endif /* XFS_BIG_INUMS */ +/* + * Inode numbers in short-form directories can come in two versions, + * either 4 bytes or 8 bytes wide. These helpers deal with the + * two forms transparently by looking at the headers i8count field. + * + * For 64-bit inode number the most significant byte must be zero. 
+ */ +static xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *from) +{ + if (hdr->i8count) + return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + else + return get_unaligned_be32(&from->i4.i); +} + +static void +xfs_dir2_sf_put_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *to, + xfs_ino_t ino) +{ + ASSERT((ino & 0xff00000000000000ULL) == 0); + + if (hdr->i8count) + put_unaligned_be64(ino, &to->i8.i); + else + put_unaligned_be32(ino, &to->i4.i); +} + +xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + return xfs_dir2_sf_get_ino(hdr, &hdr->parent); +} + +static void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. The inode numbers may only be accessed + * through the helpers below. + */ +static xfs_dir2_inou_t * +xfs_dir2_sfe_inop( + struct xfs_dir2_sf_entry *sfep) +{ + return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]; +} + +xfs_ino_t +xfs_dir2_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep)); +} + +static void +xfs_dir2_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino); +} + /* * Given a block directory (dp/block), calculate its size as a shortform (sf) * directory and a header for the sf directory, if it will fit it the @@ -68,7 +142,7 @@ static void xfs_dir2_sf_toino8(xfs_da_args_t *args); int /* size for sf form */ xfs_dir2_block_sfsize( xfs_inode_t *dp, /* incore inode pointer */ - xfs_dir2_block_t *block, /* block directory data */ + xfs_dir2_data_hdr_t *hdr, /* block directory data */ xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */ { xfs_dir2_dataptr_t addr; /* data entry address */ @@ -88,7 +162,7 @@ xfs_dir2_block_sfsize( mp = dp->i_mount; count = i8count = namelen = 0; - btp = xfs_dir2_block_tail_p(mp, block); + btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -101,7 +175,7 @@ xfs_dir2_block_sfsize( * Calculate the pointer to the entry at hand. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); + ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); /* * Detect . and .., so we can special-case them. * . is not included in sf directories. 
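
[Editor's note: the xfs_dir2_sf_get_ino()/xfs_dir2_sf_put_ino() helpers added above replace the old xfs_dir2_sf_get_inumber()/put_inumber() pair and hide the fact that a short-form directory stores inode numbers big-endian, unaligned, and either 4 or 8 bytes wide depending on the header's i8count. Below is a standalone userspace sketch of the same packing, handy for convincing yourself the widths and the 0x00ffffffffffffff mask line up; the function names and main() harness are illustrative, not part of the patch.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Pack/unpack an inode number the way a short-form XFS directory
 * does: big-endian bytes, 4 of them when every inode in the
 * directory fits in 32 bits (i8count == 0), 8 otherwise.  The top
 * byte of a 64-bit inode number must be zero, matching the ASSERT
 * in xfs_dir2_sf_put_ino().
 */
static uint64_t sf_get_ino(const uint8_t *p, int i8count)
{
	uint64_t ino = 0;
	int i, n = i8count ? 8 : 4;

	for (i = 0; i < n; i++)		/* big-endian: MSB first */
		ino = (ino << 8) | p[i];
	return ino & 0x00ffffffffffffffULL;
}

static void sf_put_ino(uint8_t *p, int i8count, uint64_t ino)
{
	int i, n = i8count ? 8 : 4;

	assert((ino & 0xff00000000000000ULL) == 0);
	for (i = n - 1; i >= 0; i--) {	/* fill LSB-to-MSB backwards */
		p[i] = ino & 0xff;
		ino >>= 8;
	}
}

int main(void)
{
	uint8_t buf[8];

	sf_put_ino(buf, 1, 0x00123456789abcdeULL);
	printf("round trip 8-byte: %#llx\n",
	       (unsigned long long)sf_get_ino(buf, 1));

	sf_put_ino(buf, 0, 0x1234cafe);	/* fits in 32 bits */
	printf("round trip 4-byte: %#llx\n",
	       (unsigned long long)sf_get_ino(buf, 0));
	return 0;
}

Routing all access through two helpers is also what lets the rest of this patch drop the xfs_dir2_sf_t wrapper type and work directly on xfs_dir2_sf_hdr_t.
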
@@ -138,7 +212,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - xfs_dir2_sf_put_inumber((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); + xfs_dir2_sf_put_parent_ino(sfhp, parent); return size; } @@ -153,7 +227,7 @@ xfs_dir2_block_to_sf( int size, /* shortform directory size */ xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ @@ -164,8 +238,7 @@ xfs_dir2_block_to_sf( xfs_mount_t *mp; /* filesystem mount point */ char *ptr; /* current data pointer */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ - xfs_ino_t temp; + xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ trace_xfs_dir2_block_to_sf(args); @@ -176,13 +249,14 @@ xfs_dir2_block_to_sf( * Make a copy of the block data, so we can shrink the inode * and add local data. */ - block = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(block, bp->data, mp->m_dirblksize); + hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); + memcpy(hdr, bp->data, mp->m_dirblksize); logflags = XFS_ILOG_CORE; if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { ASSERT(error != ENOSPC); goto out; } + /* * The buffer is now unconditionally gone, whether * xfs_dir2_shrink_inode worked or not. @@ -198,14 +272,14 @@ xfs_dir2_block_to_sf( /* * Copy the header into the newly allocate local space. */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); dp->i_d.di_size = size; /* * Set up to loop over the block's entries. */ - btp = xfs_dir2_block_tail_p(mp, block); - ptr = (char *)block->u; + btp = xfs_dir2_block_tail_p(mp, hdr); + ptr = (char *)(hdr + 1); endptr = (char *)xfs_dir2_block_leaf_p(btp); sfep = xfs_dir2_sf_firstentry(sfp); /* @@ -233,7 +307,7 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); + xfs_dir2_sf_get_parent_ino(sfp)); /* * Normal entry, copy it into shortform. 
*/ @@ -241,11 +315,11 @@ xfs_dir2_block_to_sf( sfep->namelen = dep->namelen; xfs_dir2_sf_put_offset(sfep, (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)block)); + ((char *)dep - (char *)hdr)); memcpy(sfep->name, dep->name, dep->namelen); - temp = be64_to_cpu(dep->inumber); - xfs_dir2_sf_put_inumber(sfp, &temp, - xfs_dir2_sf_inumberp(sfep)); + xfs_dir2_sfe_put_ino(sfp, sfep, + be64_to_cpu(dep->inumber)); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); } ptr += xfs_dir2_data_entsize(dep->namelen); @@ -254,7 +328,7 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(block); + kmem_free(hdr); return error; } @@ -277,7 +351,7 @@ xfs_dir2_sf_addname( xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int old_isize; /* di_size before adding name */ int pick; /* which algorithm to use */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ trace_xfs_dir2_sf_addname(args); @@ -294,19 +368,19 @@ xfs_dir2_sf_addname( } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. */ - add_entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); + add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen); incr_isize = add_entsize; objchange = 0; #if XFS_BIG_INUMS /* * Do we have to change to 8 byte inodes? */ - if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { /* * Yes, adjust the entry size and the total size. */ @@ -314,7 +388,7 @@ xfs_dir2_sf_addname( (uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t); incr_isize += - (sfp->hdr.count + 2) * + (sfp->count + 2) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); objchange = 1; @@ -384,21 +458,21 @@ xfs_dir2_sf_addname_easy( { int byteoff; /* byte offset in sf dir */ xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ dp = args->dp; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize_byname(sfp, args->namelen), + xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. @@ -406,15 +480,14 @@ xfs_dir2_sf_addname_easy( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir2_sf_put_inumber(sfp, &args->inumber, - xfs_dir2_sf_inumberp(sfep)); + xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); /* * Update the header and inode. 
*/ - sfp->hdr.count++; + sfp->count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) - sfp->hdr.i8count++; + sfp->i8count++; #endif dp->i_d.di_size = new_isize; xfs_dir2_sf_check(args); @@ -444,19 +517,19 @@ xfs_dir2_sf_addname_hard( xfs_dir2_data_aoff_t offset; /* current offset value */ int old_isize; /* previous di_size */ xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ - xfs_dir2_sf_t *oldsfp; /* original shortform dir */ + xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ - xfs_dir2_sf_t *sfp; /* new shortform dir */ + xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ /* * Copy the old directory to the stack buffer. */ dp = args->dp; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; buf = kmem_alloc(old_isize, KM_SLEEP); - oldsfp = (xfs_dir2_sf_t *)buf; + oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, sfp, old_isize); /* * Loop over the old directory finding the place we're going @@ -485,7 +558,7 @@ xfs_dir2_sf_addname_hard( /* * Reset the pointer since the buffer was reallocated. */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; /* * Copy the first part of the directory, including the header. */ @@ -498,12 +571,11 @@ xfs_dir2_sf_addname_hard( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir2_sf_put_inumber(sfp, &args->inumber, - xfs_dir2_sf_inumberp(sfep)); - sfp->hdr.count++; + xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); + sfp->count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) - sfp->hdr.i8count++; + sfp->i8count++; #endif /* * If there's more left to copy, do that. @@ -537,14 +609,14 @@ xfs_dir2_sf_addname_pick( xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ int size; /* entry's data size */ int used; /* data bytes used */ dp = args->dp; mp = dp->i_mount; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(args->namelen); offset = XFS_DIR2_DATA_FIRST_OFFSET; sfep = xfs_dir2_sf_firstentry(sfp); @@ -554,7 +626,7 @@ xfs_dir2_sf_addname_pick( * Keep track of data offset and whether we've seen a place * to insert the new entry. */ - for (i = 0; i < sfp->hdr.count; i++) { + for (i = 0; i < sfp->count; i++) { if (!holefit) holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); offset = xfs_dir2_sf_get_offset(sfep) + @@ -566,7 +638,7 @@ xfs_dir2_sf_addname_pick( * was a data block (block form directory). 
*/ used = offset + - (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + (uint)sizeof(xfs_dir2_block_tail_t); /* * If it won't fit in a block form then we can't insert it, @@ -612,30 +684,30 @@ xfs_dir2_sf_check( xfs_ino_t ino; /* entry inode number */ int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ dp = args->dp; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = XFS_DIR2_DATA_FIRST_OFFSET; - ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); - i < sfp->hdr.count; + i < sfp->count; i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); - ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); + ino = xfs_dir2_sfe_get_ino(sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = xfs_dir2_sf_get_offset(sfep) + xfs_dir2_data_entsize(sfep->namelen); } - ASSERT(i8count == sfp->hdr.i8count); + ASSERT(i8count == sfp->i8count); ASSERT(XFS_BIG_INUMS || i8count == 0); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); ASSERT(offset + - (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + (uint)sizeof(xfs_dir2_block_tail_t) <= dp->i_mount->m_dirblksize); } @@ -651,7 +723,7 @@ xfs_dir2_sf_create( { xfs_inode_t *dp; /* incore directory inode */ int i8count; /* parent inode is an 8-byte number */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ int size; /* directory size */ trace_xfs_dir2_sf_create(args); @@ -681,13 +753,13 @@ xfs_dir2_sf_create( /* * Fill in the header, */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - sfp->hdr.i8count = i8count; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp->i8count = i8count; /* * Now can put in the inode number, since i8count is set. */ - xfs_dir2_sf_put_inumber(sfp, &pino, &sfp->hdr.parent); - sfp->hdr.count = 0; + xfs_dir2_sf_put_parent_ino(sfp, pino); + sfp->count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); @@ -705,7 +777,7 @@ xfs_dir2_sf_getdents( xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; @@ -724,9 +796,9 @@ xfs_dir2_sf_getdents( ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * If the block number in the offset is out of range, we're done. @@ -759,7 +831,7 @@ xfs_dir2_sf_getdents( * Put .. entry unless we're starting past it. 
*/ if (*offset <= dotdot_offset) { - ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_parent_ino(sfp); if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { *offset = dotdot_offset & 0x7fffffff; return 0; @@ -770,7 +842,7 @@ xfs_dir2_sf_getdents( * Loop while there are more entries and put'ing works. */ sfep = xfs_dir2_sf_firstentry(sfp); - for (i = 0; i < sfp->hdr.count; i++) { + for (i = 0; i < sfp->count; i++) { off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, xfs_dir2_sf_get_offset(sfep)); @@ -779,7 +851,7 @@ xfs_dir2_sf_getdents( continue; } - ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); + ino = xfs_dir2_sfe_get_ino(sfp, sfep); if (filldir(dirent, (char *)sfep->name, sfep->namelen, off & 0x7fffffff, ino, DT_UNKNOWN)) { *offset = off & 0x7fffffff; @@ -805,7 +877,7 @@ xfs_dir2_sf_lookup( int i; /* entry index */ int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ @@ -824,8 +896,8 @@ xfs_dir2_sf_lookup( } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . */ @@ -839,7 +911,7 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + args->inumber = xfs_dir2_sf_get_parent_ino(sfp); args->cmpresult = XFS_CMP_EXACT; return XFS_ERROR(EEXIST); } @@ -847,7 +919,7 @@ xfs_dir2_sf_lookup( * Loop over all the entries trying to match ours. */ ci_sfep = NULL; - for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { /* * Compare name and if it's an exact match, return the inode @@ -858,8 +930,7 @@ xfs_dir2_sf_lookup( sfep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; - args->inumber = xfs_dir2_sf_get_inumber(sfp, - xfs_dir2_sf_inumberp(sfep)); + args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); ci_sfep = sfep; @@ -891,7 +962,7 @@ xfs_dir2_sf_removename( int newsize; /* new inode size */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_removename(args); @@ -908,32 +979,31 @@ xfs_dir2_sf_removename( } ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Loop over the old directory entries. * Find the one we're deleting. 
*/ - for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ASSERT(xfs_dir2_sf_get_inumber(sfp, - xfs_dir2_sf_inumberp(sfep)) == - args->inumber); + ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) == + args->inumber); break; } } /* * Didn't find it. */ - if (i == sfp->hdr.count) + if (i == sfp->count) return XFS_ERROR(ENOENT); /* * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); + entsize = xfs_dir2_sf_entsize(sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -944,22 +1014,22 @@ xfs_dir2_sf_removename( /* * Fix up the header and file size. */ - sfp->hdr.count--; + sfp->count--; dp->i_d.di_size = newsize; /* * Reallocate, making it smaller. */ xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; #if XFS_BIG_INUMS /* * Are we changing inode number size? */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) { - if (sfp->hdr.i8count == 1) + if (sfp->i8count == 1) xfs_dir2_sf_toino4(args); else - sfp->hdr.i8count--; + sfp->i8count--; } #endif xfs_dir2_sf_check(args); @@ -983,7 +1053,7 @@ xfs_dir2_sf_replace( int i8elevated; /* sf_toino8 set i8count=1 */ #endif xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_replace(args); @@ -999,19 +1069,19 @@ xfs_dir2_sf_replace( } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); #if XFS_BIG_INUMS /* * New inode number is large, and need to convert to 8-byte inodes. */ - if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { int error; /* error return value */ int newsize; /* new inode size */ newsize = dp->i_df.if_bytes + - (sfp->hdr.count + 1) * + (sfp->count + 1) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); /* @@ -1029,7 +1099,7 @@ xfs_dir2_sf_replace( */ xfs_dir2_sf_toino8(args); i8elevated = 1; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; } else i8elevated = 0; #endif @@ -1040,34 +1110,32 @@ xfs_dir2_sf_replace( if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { #if XFS_BIG_INUMS || defined(DEBUG) - ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + ino = xfs_dir2_sf_get_parent_ino(sfp); ASSERT(args->inumber != ino); #endif - xfs_dir2_sf_put_inumber(sfp, &args->inumber, &sfp->hdr.parent); + xfs_dir2_sf_put_parent_ino(sfp, args->inumber); } /* * Normal entry, look for the name. 
*/ else { for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); - i < sfp->hdr.count; + i < sfp->count; i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { #if XFS_BIG_INUMS || defined(DEBUG) - ino = xfs_dir2_sf_get_inumber(sfp, - xfs_dir2_sf_inumberp(sfep)); + ino = xfs_dir2_sfe_get_ino(sfp, sfep); ASSERT(args->inumber != ino); #endif - xfs_dir2_sf_put_inumber(sfp, &args->inumber, - xfs_dir2_sf_inumberp(sfep)); + xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); break; } } /* * Didn't find it. */ - if (i == sfp->hdr.count) { + if (i == sfp->count) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); #if XFS_BIG_INUMS if (i8elevated) @@ -1085,10 +1153,10 @@ xfs_dir2_sf_replace( /* * And the old count was one, so need to convert to small. */ - if (sfp->hdr.i8count == 1) + if (sfp->i8count == 1) xfs_dir2_sf_toino4(args); else - sfp->hdr.i8count--; + sfp->i8count--; } /* * See if the old number was small, the new number is large. @@ -1099,9 +1167,9 @@ xfs_dir2_sf_replace( * add to the i8count unless we just converted to 8-byte * inodes (which does an implied i8count = 1) */ - ASSERT(sfp->hdr.i8count != 0); + ASSERT(sfp->i8count != 0); if (!i8elevated) - sfp->hdr.i8count++; + sfp->i8count++; } #endif xfs_dir2_sf_check(args); @@ -1121,13 +1189,12 @@ xfs_dir2_sf_toino4( char *buf; /* old dir's buffer */ xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ - xfs_ino_t ino; /* entry inode number */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_t *oldsfp; /* old sf directory */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ - xfs_dir2_sf_t *sfp; /* new sf directory */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ trace_xfs_dir2_sf_toino4(args); @@ -1140,44 +1207,42 @@ xfs_dir2_sf_toino4( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, KM_SLEEP); - oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsfp->hdr.i8count == 1); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* * Compute the new inode size. */ newsize = oldsize - - (oldsfp->hdr.count + 1) * + (oldsfp->count + 1) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* * Reset our pointers, the data has moved. */ - oldsfp = (xfs_dir2_sf_t *)buf; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; /* * Fill in the new header. */ - sfp->hdr.count = oldsfp->hdr.count; - sfp->hdr.i8count = 0; - ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); - xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); + sfp->count = oldsfp->count; + sfp->i8count = 0; + xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. 
*/ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); - i < sfp->hdr.count; + i < sfp->count; i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = xfs_dir2_sf_get_inumber(oldsfp, - xfs_dir2_sf_inumberp(oldsfep)); - xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); + xfs_dir2_sfe_put_ino(sfp, sfep, + xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); } /* * Clean up the inode. @@ -1199,13 +1264,12 @@ xfs_dir2_sf_toino8( char *buf; /* old dir's buffer */ xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ - xfs_ino_t ino; /* entry inode number */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_t *oldsfp; /* old sf directory */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ - xfs_dir2_sf_t *sfp; /* new sf directory */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ trace_xfs_dir2_sf_toino8(args); @@ -1218,44 +1282,42 @@ xfs_dir2_sf_toino8( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, KM_SLEEP); - oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsfp->hdr.i8count == 0); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* * Compute the new inode size. */ newsize = oldsize + - (oldsfp->hdr.count + 1) * + (oldsfp->count + 1) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* * Reset our pointers, the data has moved. */ - oldsfp = (xfs_dir2_sf_t *)buf; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; /* * Fill in the new header. */ - sfp->hdr.count = oldsfp->hdr.count; - sfp->hdr.i8count = 1; - ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); - xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); + sfp->count = oldsfp->count; + sfp->i8count = 1; + xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); - i < sfp->hdr.count; + i < sfp->count; i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = xfs_dir2_sf_get_inumber(oldsfp, - xfs_dir2_sf_inumberp(oldsfep)); - xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); + xfs_dir2_sfe_put_ino(sfp, sfep, + xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); } /* * Clean up the inode. diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h deleted file mode 100644 index 6ac44b550d39..000000000000 --- a/fs/xfs/xfs_dir2_sf.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_SF_H__ -#define __XFS_DIR2_SF_H__ - -/* - * Directory layout when stored internal to an inode. - * - * Small directories are packed as tightly as possible so as to - * fit into the literal area of the inode. - */ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_dir2_block; -struct xfs_inode; -struct xfs_mount; -struct xfs_trans; - -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; - -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; -#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) - -/* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; - -/* - * The parent directory has a dedicated field, and the self-pointer must - * be calculated on the fly. - * - * Entries are packed toward the top as tightly as possible. The header - * and the elements must be memcpy'd out into a work area to get correct - * alignment for the inode number fields. - */ -typedef struct xfs_dir2_sf_hdr { - __uint8_t count; /* count of entries */ - __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ -} __arch_pack xfs_dir2_sf_hdr_t; - -typedef struct xfs_dir2_sf_entry { - __uint8_t namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ - __uint8_t name[1]; /* name, variable size */ - xfs_dir2_inou_t inumber; /* inode number, var. offset */ -} __arch_pack xfs_dir2_sf_entry_t; - -typedef struct xfs_dir2_sf { - xfs_dir2_sf_hdr_t hdr; /* shortform header */ - xfs_dir2_sf_entry_t list[1]; /* shortform entries */ -} xfs_dir2_sf_t; - -static inline int xfs_dir2_sf_hdr_size(int i8count) -{ - return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ - ((i8count) == 0) * \ - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); -} - -static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep) -{ - return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]; -} - -static inline xfs_intino_t -xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) -{ - return ((sfp)->hdr.i8count == 0 ? 
\ - (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \ - (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)); -} - -static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, - xfs_dir2_inou_t *to) -{ - if ((sfp)->hdr.i8count == 0) - XFS_PUT_DIR_INO4(*(from), (to)->i4); - else - XFS_PUT_DIR_INO8(*(from), (to)->i8); -} - -static inline xfs_dir2_data_aoff_t -xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) -{ - return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i); -} - -static inline void -xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) -{ - INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off); -} - -static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) -{ - return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ - ((sfp)->hdr.i8count == 0) * \ - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); -} - -static inline int -xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) -{ - return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \ - ((sfp)->hdr.i8count == 0) * \ - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); -} - -static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp) -{ - return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfp) + xfs_dir2_sf_hdr_size(sfp->hdr.i8count))); -} - -static inline xfs_dir2_sf_entry_t * -xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) -{ - return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfep) + xfs_dir2_sf_entsize_byentry(sfp,sfep))); -} - -/* - * Functions. - */ -extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, - struct xfs_dir2_block *block, - xfs_dir2_sf_hdr_t *sfhp); -extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, - int size, xfs_dir2_sf_hdr_t *sfhp); -extern int xfs_dir2_sf_addname(struct xfs_da_args *args); -extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); -extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); -extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); -extern int xfs_dir2_sf_removename(struct xfs_da_args *args); -extern int xfs_dir2_sf_replace(struct xfs_da_args *args); - -#endif /* __XFS_DIR2_SF_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 8f6fc1a96386..c13fed8c394a 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -249,6 +249,11 @@ typedef struct xfs_fsop_resblks { #define XFS_MAX_LOG_BYTES \ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) +/* Used for sanity checks on superblock */ +#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks) +#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) * \ + (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) + /* * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 84ebeec16642..dd5628bd8d0b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -683,7 +683,7 @@ xfs_dialloc( return 0; } agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); } else { /* * Continue where we left off before. 
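/*
 * [Editorial sketch] The assertions above are rewritten from
 * "be32_to_cpu(field) == CONST" to "field == cpu_to_be32(CONST)". Both
 * forms are equivalent, but swapping the constant lets the compiler fold
 * the byte swap at build time instead of converting the loaded field on
 * every check. A user-space demonstration with htonl()/ntohl() standing
 * in for cpu_to_be32()/be32_to_cpu():
 */
#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

#define AGI_MAGIC 0x58414749u           /* "XAGI" */

int main(void)
{
        uint32_t on_disk = htonl(AGI_MAGIC);    /* big-endian, as stored */

        assert(ntohl(on_disk) == AGI_MAGIC);    /* run-time swap of field */
        assert(on_disk == htonl(AGI_MAGIC));    /* swap folded into const */
        return 0;
}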
In this case, we @@ -691,7 +691,7 @@ xfs_dialloc( */ agbp = *IO_agbp; agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); ASSERT(be32_to_cpu(agi->agi_freecount) > 0); } mp = tp->t_mountp; @@ -775,7 +775,7 @@ nextag: if (error) goto nextag; agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); } /* * Here with an allocation group that has a free inode. @@ -944,7 +944,7 @@ nextag: * See if the most recently allocated block has any free. */ newino: - if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); if (error) @@ -1085,7 +1085,7 @@ xfs_difree( return error; } agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); ASSERT(agbno < be32_to_cpu(agi->agi_length)); /* * Initialize the cursor. @@ -1438,7 +1438,7 @@ xfs_ialloc_log_agi( xfs_agi_t *agi; /* allocation group header */ agi = XFS_BUF_TO_AGI(bp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif /* * Compute byte offsets for the first and last fields. @@ -1492,7 +1492,7 @@ xfs_read_agi( /* * Validate the magic number of the agi block. */ - agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && + agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && be32_to_cpu(agi->agi_seqno) == agno; if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 16921f55c542..c6a75815aea0 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -31,7 +31,6 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" @@ -205,72 +204,6 @@ xfs_inobt_recs_inorder( } #endif /* DEBUG */ -#ifdef XFS_BTREE_TRACE -ktrace_t *xfs_inobt_trace_buf; - -STATIC void -xfs_inobt_trace_enter( - struct xfs_btree_cur *cur, - const char *func, - char *s, - int type, - int line, - __psunsigned_t a0, - __psunsigned_t a1, - __psunsigned_t a2, - __psunsigned_t a3, - __psunsigned_t a4, - __psunsigned_t a5, - __psunsigned_t a6, - __psunsigned_t a7, - __psunsigned_t a8, - __psunsigned_t a9, - __psunsigned_t a10) -{ - ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type, - (void *)func, (void *)s, NULL, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); -} - -STATIC void -xfs_inobt_trace_cursor( - struct xfs_btree_cur *cur, - __uint32_t *s0, - __uint64_t *l0, - __uint64_t *l1) -{ - *s0 = cur->bc_private.a.agno; - *l0 = cur->bc_rec.i.ir_startino; - *l1 = cur->bc_rec.i.ir_free; -} - -STATIC void -xfs_inobt_trace_key( - struct xfs_btree_cur *cur, - union xfs_btree_key *key, - __uint64_t *l0, - __uint64_t *l1) -{ - *l0 = be32_to_cpu(key->inobt.ir_startino); - *l1 = 0; -} - -STATIC void -xfs_inobt_trace_record( - struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, - __uint64_t *l0, - __uint64_t *l1, - __uint64_t *l2) -{ - *l0 = be32_to_cpu(rec->inobt.ir_startino); - *l1 = be32_to_cpu(rec->inobt.ir_freecount); - *l2 = 
be64_to_cpu(rec->inobt.ir_free); -} -#endif /* XFS_BTREE_TRACE */ - static const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), @@ -286,18 +219,10 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, #endif - -#ifdef XFS_BTREE_TRACE - .trace_enter = xfs_inobt_trace_enter, - .trace_cursor = xfs_inobt_trace_cursor, - .trace_key = xfs_inobt_trace_key, - .trace_record = xfs_inobt_trace_record, -#endif }; /* diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 3631783b2b53..7759812c1bbe 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -38,7 +38,6 @@ #include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_btree_trace.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a098a20ca63e..3cc21ddf9f7e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -37,7 +37,6 @@ #include "xfs_buf_item.h" #include "xfs_inode_item.h" #include "xfs_btree.h" -#include "xfs_btree_trace.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" @@ -52,7 +51,7 @@ kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; /* - * Used in xfs_itruncate(). This is the maximum number of extents + * Used in xfs_itruncate_extents(). This is the maximum number of extents * freed from a file in a single transaction. */ #define XFS_ITRUNC_MAX_EXTENTS 2 @@ -167,7 +166,7 @@ xfs_imap_to_bp( dip = (xfs_dinode_t *)xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); - di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, @@ -802,7 +801,7 @@ xfs_iread( * If we got something that isn't an inode it means someone * (nfs or dmi) has a stale handle. */ - if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { #ifdef DEBUG xfs_alert(mp, "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", @@ -1179,15 +1178,15 @@ xfs_ialloc( * at least do it for regular files. */ #ifdef DEBUG -void +STATIC void xfs_isize_check( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_fsize_t isize) + struct xfs_inode *ip, + xfs_fsize_t isize) { - xfs_fileoff_t map_first; - int nimaps; - xfs_bmbt_irec_t imaps[2]; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t map_first; + int nimaps; + xfs_bmbt_irec_t imaps[2]; if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) return; @@ -1214,168 +1213,14 @@ xfs_isize_check( ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); } +#else /* DEBUG */ +#define xfs_isize_check(ip, isize) #endif /* DEBUG */ /* - * Calculate the last possible buffered byte in a file. This must - * include data that was buffered beyond the EOF by the write code. - * This also needs to deal with overflowing the xfs_fsize_t type - * which can happen for sizes near the limit. - * - * We also need to take into account any blocks beyond the EOF. It - * may be the case that they were buffered by a write which failed. - * In that case the pages will still be in memory, but the inode size - * will never have been updated. 
- */ -STATIC xfs_fsize_t -xfs_file_last_byte( - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - xfs_fsize_t last_byte; - xfs_fileoff_t last_block; - xfs_fileoff_t size_last_block; - int error; - - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); - - mp = ip->i_mount; - /* - * Only check for blocks beyond the EOF if the extents have - * been read in. This eliminates the need for the inode lock, - * and it also saves us from looking when it really isn't - * necessary. - */ - if (ip->i_df.if_flags & XFS_IFEXTENTS) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmap_last_offset(NULL, ip, &last_block, - XFS_DATA_FORK); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (error) { - last_block = 0; - } - } else { - last_block = 0; - } - size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); - last_block = XFS_FILEOFF_MAX(last_block, size_last_block); - - last_byte = XFS_FSB_TO_B(mp, last_block); - if (last_byte < 0) { - return XFS_MAXIOFFSET(mp); - } - last_byte += (1 << mp->m_writeio_log); - if (last_byte < 0) { - return XFS_MAXIOFFSET(mp); - } - return last_byte; -} - -/* - * Start the truncation of the file to new_size. The new size - * must be smaller than the current size. This routine will - * clear the buffer and page caches of file data in the removed - * range, and xfs_itruncate_finish() will remove the underlying - * disk blocks. - * - * The inode must have its I/O lock locked EXCLUSIVELY, and it - * must NOT have the inode lock held at all. This is because we're - * calling into the buffer/page cache code and we can't hold the - * inode lock when we do so. - * - * We need to wait for any direct I/Os in flight to complete before we - * proceed with the truncate. This is needed to prevent the extents - * being read or written by the direct I/Os from being removed while the - * I/O is in flight as there is no other method of synchronising - * direct I/O with the truncate operation. Also, because we hold - * the IOLOCK in exclusive mode, we prevent new direct I/Os from being - * started until the truncate completes and drops the lock. Essentially, - * the xfs_ioend_wait() call forms an I/O barrier that provides strict - * ordering between direct I/Os and the truncate operation. - * - * The flags parameter can have either the value XFS_ITRUNC_DEFINITE - * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used - * in the case that the caller is locking things out of order and - * may not be able to call xfs_itruncate_finish() with the inode lock - * held without dropping the I/O lock. If the caller must drop the - * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() - * must be called again with all the same restrictions as the initial - * call. - */ -int -xfs_itruncate_start( - xfs_inode_t *ip, - uint flags, - xfs_fsize_t new_size) -{ - xfs_fsize_t last_byte; - xfs_off_t toss_start; - xfs_mount_t *mp; - int error = 0; - - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT((new_size == 0) || (new_size <= ip->i_size)); - ASSERT((flags == XFS_ITRUNC_DEFINITE) || - (flags == XFS_ITRUNC_MAYBE)); - - mp = ip->i_mount; - - /* wait for the completion of any pending DIOs */ - if (new_size == 0 || new_size < ip->i_size) - xfs_ioend_wait(ip); - - /* - * Call toss_pages or flushinval_pages to get rid of pages - * overlapping the region being removed. We have to use - * the less efficient flushinval_pages in the case that the - * caller may not be able to finish the truncate without - * dropping the inode's I/O lock. 
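/*
 * [Editorial sketch] The deleted xfs_file_last_byte() above pads the
 * last mapped byte by the write I/O window and clamps to the maximum
 * offset whenever the signed byte count overflows (goes negative). A
 * compact user-space model of that clamping; MAX_OFF stands in for
 * XFS_MAXIOFFSET() and the values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_OFF INT64_MAX

static int64_t last_byte(int64_t last_block, unsigned blocklog,
                         unsigned writeio_log)
{
        int64_t byte = last_block << blocklog;  /* blocks to bytes */

        if (byte < 0)                           /* shift overflowed */
                return MAX_OFF;
        byte += 1LL << writeio_log;             /* pad for past-EOF writes */
        if (byte < 0)                           /* pad overflowed */
                return MAX_OFF;
        return byte;
}

int main(void)
{
        printf("%lld\n", (long long)last_byte(100, 12, 16));
        return 0;
}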
Make sure - * to catch any pages brought in by buffers overlapping - * the EOF by searching out beyond the isize by our - * block size. We round new_size up to a block boundary - * so that we don't toss things on the same block as - * new_size but before it. - * - * Before calling toss_page or flushinval_pages, make sure to - * call remapf() over the same region if the file is mapped. - * This frees up mapped file references to the pages in the - * given range and for the flushinval_pages case it ensures - * that we get the latest mapped changes flushed out. - */ - toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - toss_start = XFS_FSB_TO_B(mp, toss_start); - if (toss_start < 0) { - /* - * The place to start tossing is beyond our maximum - * file size, so there is no way that the data extended - * out there. - */ - return 0; - } - last_byte = xfs_file_last_byte(ip); - trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte); - if (last_byte > toss_start) { - if (flags & XFS_ITRUNC_DEFINITE) { - xfs_tosspages(ip, toss_start, - -1, FI_REMAPF_LOCKED); - } else { - error = xfs_flushinval_pages(ip, toss_start, - -1, FI_REMAPF_LOCKED); - } - } - -#ifdef DEBUG - if (new_size == 0) { - ASSERT(VN_CACHED(VFS_I(ip)) == 0); - } -#endif - return error; -} - -/* - * Shrink the file to the given new_size. The new size must be smaller than - * the current size. This will free up the underlying blocks in the removed - * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). + * Free up the underlying blocks past new_size. The new size must be smaller + * than the current size. This routine can be used both for the attribute and + * data fork, and does not modify the inode size, which is left to the caller. * * The transaction passed to this routine must have made a permanent log * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the @@ -1387,31 +1232,6 @@ xfs_itruncate_start( * will be "held" within the returned transaction. This routine does NOT * require any disk space to be reserved for it within the transaction. * - * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it - * indicates the fork which is to be truncated. For the attribute fork we only - * support truncation to size 0. - * - * We use the sync parameter to indicate whether or not the first transaction - * we perform might have to be synchronous. For the attr fork, it needs to be - * so if the unlink of the inode is not yet known to be permanent in the log. - * This keeps us from freeing and reusing the blocks of the attribute fork - * before the unlink of the inode becomes permanent. - * - * For the data fork, we normally have to run synchronously if we're being - * called out of the inactive path or we're being called out of the create path - * where we're truncating an existing file. Either way, the truncate needs to - * be sync so blocks don't reappear in the file with altered data in case of a - * crash. wsync filesystems can run the first case async because anything that - * shrinks the inode has to run sync so by the time we're called here from - * inactive, the inode size is permanently set to 0. - * - * Calls from the truncate path always need to be sync unless we're in a wsync - * filesystem and the file has already been unlinked. - * - * The caller is responsible for correctly setting the sync parameter. It gets - * too hard for us to guess here which path we're being called out of just - * based on inode state. 
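/*
 * [Editorial sketch] The toss_start computation in the deleted code
 * above rounds new_size up to a filesystem-block boundary (bytes to
 * blocks and back), so data that shares the final block with new_size is
 * never tossed. A user-space model of the round-up for power-of-two
 * block sizes; names are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t b_to_fsb(uint64_t bytes, unsigned blocklog)
{
        return (bytes + (1ULL << blocklog) - 1) >> blocklog; /* round up */
}

static uint64_t fsb_to_b(uint64_t fsb, unsigned blocklog)
{
        return fsb << blocklog;
}

int main(void)
{
        /* 4096-byte blocks: a 5000-byte file tosses from offset 8192 */
        printf("%llu\n",
               (unsigned long long)fsb_to_b(b_to_fsb(5000, 12), 12));
        return 0;
}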
- * * If we get an error, we must return with the inode locked and linked into the * current transaction. This keeps things simple for the higher level code, * because it always knows that the inode is locked and held in the transaction @@ -1419,124 +1239,30 @@ xfs_itruncate_start( * dirty on error so that transactions can be easily aborted if possible. */ int -xfs_itruncate_finish( - xfs_trans_t **tp, - xfs_inode_t *ip, - xfs_fsize_t new_size, - int fork, - int sync) +xfs_itruncate_extents( + struct xfs_trans **tpp, + struct xfs_inode *ip, + int whichfork, + xfs_fsize_t new_size) { - xfs_fsblock_t first_block; - xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; - xfs_filblks_t unmap_len=0; - xfs_mount_t *mp; - xfs_trans_t *ntp; - int done; - int committed; - xfs_bmap_free_t free_list; - int error; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp = *tpp; + struct xfs_trans *ntp; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + xfs_fileoff_t first_unmap_block; + xfs_fileoff_t last_block; + xfs_filblks_t unmap_len; + int committed; + int error = 0; + int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT((new_size == 0) || (new_size <= ip->i_size)); - ASSERT(*tp != NULL); - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - ASSERT(ip->i_transp == *tp); + ASSERT(new_size <= ip->i_size); + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); - - - ntp = *tp; - mp = (ntp)->t_mountp; - ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); - - /* - * We only support truncating the entire attribute fork. - */ - if (fork == XFS_ATTR_FORK) { - new_size = 0LL; - } - first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - trace_xfs_itruncate_finish_start(ip, new_size); - - /* - * The first thing we do is set the size to new_size permanently - * on disk. This way we don't have to worry about anyone ever - * being able to look at the data being freed even in the face - * of a crash. What we're getting around here is the case where - * we free a block, it is allocated to another file, it is written - * to, and then we crash. If the new data gets written to the - * file but the log buffers containing the free and reallocation - * don't, then we'd end up with garbage in the blocks being freed. - * As long as we make the new_size permanent before actually - * freeing any blocks it doesn't matter if they get written to. - * - * The callers must signal into us whether or not the size - * setting here must be synchronous. There are a few cases - * where it doesn't have to be synchronous. Those cases - * occur if the file is unlinked and we know the unlink is - * permanent or if the blocks being truncated are guaranteed - * to be beyond the inode eof (regardless of the link count) - * and the eof value is permanent. Both of these cases occur - * only on wsync-mounted filesystems. In those cases, we're - * guaranteed that no user will ever see the data in the blocks - * that are being truncated so the truncate can run async. - * In the free beyond eof case, the file may wind up with - * more blocks allocated to it than it needs if we crash - * and that won't get fixed until the next time the file - * is re-opened and closed but that's ok as that shouldn't - * be too many blocks. - * - * However, we can't just make all wsync xactions run async - * because there's one call out of the create path that needs - * to run sync where it's truncating an existing file to size - * 0 whose size is > 0. 
- * - * It's probably possible to come up with a test in this - * routine that would correctly distinguish all the above - * cases from the values of the function parameters and the - * inode state but for sanity's sake, I've decided to let the - * layers above just tell us. It's simpler to correctly figure - * out in the layer above exactly under what conditions we - * can run async and I think it's easier for others read and - * follow the logic in case something has to be changed. - * cscope is your friend -- rcc. - * - * The attribute fork is much simpler. - * - * For the attribute fork we allow the caller to tell us whether - * the unlink of the inode that led to this call is yet permanent - * in the on disk log. If it is not and we will be freeing extents - * in this inode then we make the first transaction synchronous - * to make sure that the unlink is permanent by the time we free - * the blocks. - */ - if (fork == XFS_DATA_FORK) { - if (ip->i_d.di_nextents > 0) { - /* - * If we are not changing the file size then do - * not update the on-disk file size - we may be - * called from xfs_inactive_free_eofblocks(). If we - * update the on-disk file size and then the system - * crashes before the contents of the file are - * flushed to disk then the files may be full of - * holes (ie NULL files bug). - */ - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); - } - } - } else if (sync) { - ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); - if (ip->i_d.di_anextents > 0) - xfs_trans_set_sync(ntp); - } - ASSERT(fork == XFS_DATA_FORK || - (fork == XFS_ATTR_FORK && - ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || - (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); /* * Since it is possible for space to become allocated beyond @@ -1547,128 +1273,142 @@ xfs_itruncate_finish( * beyond the maximum file size (ie it is the same as last_block), * then there is nothing to do. */ + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - ASSERT(first_unmap_block <= last_block); - done = 0; - if (last_block == first_unmap_block) { - done = 1; - } else { - unmap_len = last_block - first_unmap_block + 1; - } + if (first_unmap_block == last_block) + return 0; + + ASSERT(first_unmap_block < last_block); + unmap_len = last_block - first_unmap_block + 1; while (!done) { - /* - * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() - * will tell us whether it freed the entire range or - * not. If this is a synchronous mount (wsync), - * then we can tell bunmapi to keep all the - * transactions asynchronous since the unlink - * transaction that made this inode inactive has - * already hit the disk. There's no danger of - * the freed blocks being reused, there being a - * crash, and the reused blocks suddenly reappearing - * in this file with garbage in them once recovery - * runs. - */ xfs_bmap_init(&free_list, &first_block); - error = xfs_bunmapi(ntp, ip, + error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, - xfs_bmapi_aflag(fork), + xfs_bmapi_aflag(whichfork), XFS_ITRUNC_MAX_EXTENTS, &first_block, &free_list, &done); - if (error) { - /* - * If the bunmapi call encounters an error, - * return to the caller where the transaction - * can be properly aborted. We just need to - * make sure we're not holding any resources - * that we were not when we came in. 
- */ - xfs_bmap_cancel(&free_list); - return error; - } + if (error) + goto out_bmap_cancel; /* * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(tp, &free_list, &committed); - ntp = *tp; + error = xfs_bmap_finish(&tp, &free_list, &committed); if (committed) - xfs_trans_ijoin(ntp, ip); - - if (error) { - /* - * If the bmap finish call encounters an error, return - * to the caller where the transaction can be properly - * aborted. We just need to make sure we're not - * holding any resources that we were not when we came - * in. - * - * Aborting from this point might lose some blocks in - * the file system, but oh well. - */ - xfs_bmap_cancel(&free_list); - return error; - } + xfs_trans_ijoin(tp, ip); + if (error) + goto out_bmap_cancel; if (committed) { /* * Mark the inode dirty so it will be logged and * moved forward in the log as part of every commit. */ - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - ntp = xfs_trans_dup(ntp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; + ntp = xfs_trans_dup(tp); + error = xfs_trans_commit(tp, 0); + tp = ntp; - xfs_trans_ijoin(ntp, ip); + xfs_trans_ijoin(tp, ip); if (error) - return error; + goto out; + /* - * transaction commit worked ok so we can drop the extra ticket + * Transaction commit worked ok so we can drop the extra ticket * reference that we gained in xfs_trans_dup() */ - xfs_log_ticket_put(ntp->t_ticket); - error = xfs_trans_reserve(ntp, 0, + xfs_log_ticket_put(tp->t_ticket); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT); if (error) - return error; + goto out; } + +out: + *tpp = tp; + return error; +out_bmap_cancel: /* - * Only update the size in the case of the data fork, but - * always re-log the inode so that our permanent transaction - * can keep on rolling it forward in the log. + * If the bunmapi call encounters an error, return to the caller where + * the transaction can be properly aborted. We just need to make sure + * we're not holding any resources that we were not when we came in. */ - if (fork == XFS_DATA_FORK) { - xfs_isize_check(mp, ip, new_size); + xfs_bmap_cancel(&free_list); + goto out; +} + +int +xfs_itruncate_data( + struct xfs_trans **tpp, + struct xfs_inode *ip, + xfs_fsize_t new_size) +{ + int error; + + trace_xfs_itruncate_data_start(ip, new_size); + + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new_size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + if (ip->i_d.di_nextents > 0) { /* - * If we are not changing the file size then do - * not update the on-disk file size - we may be - * called from xfs_inactive_free_eofblocks(). If we - * update the on-disk file size and then the system - * crashes before the contents of the file are - * flushed to disk then the files may be full of - * holes (ie NULL files bug). 
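/*
 * [Editorial sketch] The unmap loop in xfs_itruncate_extents() above
 * frees at most XFS_ITRUNC_MAX_EXTENTS extents per transaction and then
 * "rolls" the transaction: xfs_trans_dup() carries the permanent log
 * reservation to a fresh transaction, the old one is committed, and
 * xfs_trans_reserve() tops the new one up. A user-space model of that
 * bounded batching; all names below are illustrative.
 */
#include <stdio.h>

#define MAX_EXTENTS_PER_TXN 2   /* mirrors XFS_ITRUNC_MAX_EXTENTS */

struct txn { int id; };

static struct txn roll(struct txn t)
{
        struct txn next = { t.id + 1 };         /* dup keeps reservation */

        printf("commit txn %d, continue in txn %d\n", t.id, next.id);
        return next;                            /* re-reserve here */
}

int main(void)
{
        struct txn t = { 0 };
        int extents = 5;

        while (extents > 0) {
                int n = extents < MAX_EXTENTS_PER_TXN ?
                        extents : MAX_EXTENTS_PER_TXN;

                printf("txn %d frees %d extent(s)\n", t.id, n);
                extents -= n;
                if (extents)
                        t = roll(t);    /* bound work per transaction */
        }
        return 0;
}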
+ * If we are not changing the file size then do not update + * the on-disk file size - we may be called from + * xfs_inactive_free_eofblocks(). If we update the on-disk + * file size and then the system crashes before the contents + * of the file are flushed to disk then the files may be + * full of holes (ie NULL files bug). */ if (ip->i_size != new_size) { ip->i_d.di_size = new_size; ip->i_size = new_size; + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); } } - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); - ASSERT((new_size != 0) || - (fork == XFS_ATTR_FORK) || - (ip->i_delayed_blks == 0)); - ASSERT((new_size != 0) || - (fork == XFS_ATTR_FORK) || - (ip->i_d.di_nextents == 0)); - trace_xfs_itruncate_finish_end(ip, new_size); + + error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); + if (error) + return error; + + /* + * If we are not changing the file size then do not update the on-disk + * file size - we may be called from xfs_inactive_free_eofblocks(). + * If we update the on-disk file size and then the system crashes + * before the contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + xfs_isize_check(ip, new_size); + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + } + + ASSERT(new_size != 0 || ip->i_delayed_blks == 0); + ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); + + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_data_end(ip, new_size); return 0; } @@ -1694,7 +1434,6 @@ xfs_iunlink( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_mode != 0); - ASSERT(ip->i_transp == tp); mp = tp->t_mountp; @@ -1717,7 +1456,7 @@ xfs_iunlink( ASSERT(agi->agi_unlinked[bucket_index]); ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { + if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { /* * There is already another inode in the bucket we need * to add ourselves to. Add us at the front of the list. @@ -1728,8 +1467,7 @@ xfs_iunlink( if (error) return error; - ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); - /* both on-disk, don't endian flip twice */ + ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1794,7 +1532,7 @@ xfs_iunlink_remove( agino = XFS_INO_TO_AGINO(mp, ip->i_ino); ASSERT(agino != 0); bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); + ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)); ASSERT(agi->agi_unlinked[bucket_index]); if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { @@ -1959,7 +1697,7 @@ xfs_ifree_cluster( * stale first, we will not attempt to lock them in the loop * below as the XFS_ISTALE flag will be set. */ - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + lip = bp->b_fspriv; while (lip) { if (lip->li_type == XFS_LI_INODE) { iip = (xfs_inode_log_item_t *)lip; @@ -2086,7 +1824,6 @@ xfs_ifree( xfs_buf_t *ibp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_transp == tp); ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); @@ -2733,7 +2470,7 @@ cluster_corrupt_out: * mark the buffer as an error and call them. 
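/*
 * [Editorial sketch] Throughout this series the accessor macros
 * XFS_BUF_IODONE_FUNC()/XFS_BUF_FSPRIVATE() give way to direct member
 * access (bp->b_iodone, bp->b_fspriv), as in the hunk just below. A
 * user-space model of the underlying pattern -- a completion callback
 * plus an opaque private pointer on a buffer; struct buf here is
 * illustrative, not the kernel's xfs_buf.
 */
#include <stdio.h>

struct buf {
        void (*b_iodone)(struct buf *); /* I/O completion callback */
        void *b_fspriv;                 /* owner's private data */
};

static void log_iodone(struct buf *bp)
{
        printf("I/O done for %s\n", (const char *)bp->b_fspriv);
}

int main(void)
{
        struct buf bp = { .b_iodone = log_iodone, .b_fspriv = "iclog 0" };

        if (bp.b_iodone)        /* same test the error path below performs */
                bp.b_iodone(&bp);
        return 0;
}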
Otherwise * mark it as stale and brelse. */ - if (XFS_BUF_IODONE_FUNC(bp)) { + if (bp->b_iodone) { XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); XFS_BUF_ERROR(bp,EIO); @@ -2920,7 +2657,7 @@ xfs_iflush_int( */ xfs_synchronize_times(ip); - if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, + if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", @@ -3073,8 +2810,8 @@ xfs_iflush_int( */ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); } else { /* * We're flushing an inode which is not in the AIL and has diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 964cfea77686..a97644ab945a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -241,7 +241,6 @@ typedef struct xfs_inode { xfs_ifork_t i_df; /* data fork */ /* Transaction and locking information. */ - struct xfs_trans *i_transp; /* ptr to owning transaction*/ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ @@ -457,16 +456,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) extern struct lock_class_key xfs_iolock_reclaimable; -/* - * Flags for xfs_itruncate_start(). - */ -#define XFS_ITRUNC_DEFINITE 0x1 -#define XFS_ITRUNC_MAYBE 0x2 - -#define XFS_ITRUNC_FLAGS \ - { XFS_ITRUNC_DEFINITE, "DEFINITE" }, \ - { XFS_ITRUNC_MAYBE, "MAYBE" } - /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and @@ -501,9 +490,10 @@ uint xfs_ip2xflags(struct xfs_inode *); uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); -int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); -int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, - xfs_fsize_t, int, int); +int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, + int, xfs_fsize_t); +int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, + xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); @@ -579,13 +569,6 @@ void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -#ifdef DEBUG -void xfs_isize_check(struct xfs_mount *, struct xfs_inode *, - xfs_fsize_t); -#else /* DEBUG */ -#define xfs_isize_check(mp, ip, isize) -#endif /* DEBUG */ - #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b1e88d56069c..588406dc6a35 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -632,13 +632,8 @@ xfs_inode_item_unlock( struct xfs_inode *ip = iip->ili_inode; unsigned short lock_flags; - ASSERT(iip->ili_inode->i_itemp != NULL); - ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); - - /* - * Clear the transaction pointer in the inode. 
- */ - ip->i_transp = NULL; + ASSERT(ip->i_itemp != NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * If the inode needed a separate buffer with which to log @@ -664,8 +659,8 @@ xfs_inode_item_unlock( lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; if (lock_flags) { - xfs_iunlock(iip->ili_inode, lock_flags); - IRELE(iip->ili_inode); + xfs_iunlock(ip, lock_flags); + IRELE(ip); } } @@ -879,7 +874,7 @@ xfs_iflush_done( * Scan the buffer IO completions for other inodes being completed and * attach them to the current inode log item. */ - blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + blip = bp->b_fspriv; prev = NULL; while (blip != NULL) { if (lip->li_cb != xfs_iflush_done) { @@ -891,7 +886,7 @@ xfs_iflush_done( /* remove from list */ next = blip->li_bio_list; if (!prev) { - XFS_BUF_SET_FSPRIVATE(bp, next); + bp->b_fspriv = next; } else { prev->li_bio_list = next; } diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index b8e4ee4e89a4..b253c0ea5bec 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -28,17 +28,6 @@ typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ -/* - * Useful inode bits for this kernel. - * Used in some places where having 64-bits in the 32-bit kernels - * costs too much. - */ -#if XFS_BIG_INUMS -typedef xfs_ino_t xfs_intino_t; -#else -typedef __uint32_t xfs_intino_t; -#endif - #define NULLFSINO ((xfs_ino_t)-1) #define NULLAGINO ((xfs_agino_t)-1) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 41d5b8f2bf92..06ff8437ed8e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -871,15 +871,9 @@ xlog_space_left( void xlog_iodone(xfs_buf_t *bp) { - xlog_in_core_t *iclog; - xlog_t *l; - int aborted; - - iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); - ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); - aborted = 0; - l = iclog->ic_log; + xlog_in_core_t *iclog = bp->b_fspriv; + xlog_t *l = iclog->ic_log; + int aborted = 0; /* * Race to shutdown the filesystem if we see an error. 
@@ -1056,10 +1050,9 @@ xlog_alloc_log(xfs_mount_t *mp, bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); if (!bp) goto out_free_log; - XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + bp->b_iodone = xlog_iodone; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1090,10 +1083,8 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_iclog_size, 0); if (!bp) goto out_free_iclog; - if (!XFS_BUF_CPSEMA(bp)) - ASSERT(0); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; #ifdef DEBUG @@ -1118,7 +1109,7 @@ xlog_alloc_log(xfs_mount_t *mp, iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); - ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); + ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1254,9 +1245,8 @@ STATIC int xlog_bdstrat( struct xfs_buf *bp) { - struct xlog_in_core *iclog; + struct xlog_in_core *iclog = bp->b_fspriv; - iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); if (iclog->ic_state & XLOG_STATE_IOERROR) { XFS_BUF_ERROR(bp, EIO); XFS_BUF_STALE(bp); @@ -1269,7 +1259,6 @@ xlog_bdstrat( return 0; } - bp->b_flags |= _XBF_RUN_QUEUES; xfs_buf_iorequest(bp); return 0; } @@ -1351,8 +1340,6 @@ xlog_sync(xlog_t *log, } bp = iclog->ic_bp; - ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); @@ -1366,22 +1353,28 @@ xlog_sync(xlog_t *log, iclog->ic_bwritecnt = 1; } XFS_BUF_SET_COUNT(bp, count); - XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); - bp->b_flags |= XBF_LOG_BUFFER; + bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + bp->b_flags |= XBF_FUA; + /* - * If we have an external log device, flush the data device - * before flushing the log to make sure all meta data - * written back from the AIL actually made it to disk - * before writing out the new log tail LSN in the log buffer. + * Flush the data device before flushing the log to make + * sure all meta data written back from the AIL actually made + * it to disk before stamping the new log tail LSN into the + * log buffer. For an external log we need to issue the + * flush explicitly, and unfortunately synchronously here; + * for an internal log we can simply use the block layer + * state machine for preflushes. 
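/*
 * [Editorial sketch] The barrier rework in xlog_sync() below replaces
 * XBF_ORDERED with an FUA write plus, for an internal log, a block-layer
 * preflush (XBF_FLUSH); an external log instead flushes the data device
 * explicitly and synchronously first. A stand-alone restatement of that
 * decision; the flag values and issue_flush() are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

#define XBF_FUA   (1u << 0)     /* force unit access for this write */
#define XBF_FLUSH (1u << 1)     /* preflush the device before writing */

static void issue_flush(const char *dev)
{
        printf("explicit flush of %s\n", dev);
}

static unsigned log_write_flags(bool barrier, bool external_log)
{
        unsigned flags = 0;

        if (!barrier)
                return 0;
        flags |= XBF_FUA;
        if (external_log)
                issue_flush("data device");     /* synchronous, up front */
        else
                flags |= XBF_FLUSH;             /* block layer preflush */
        return flags;
}

int main(void)
{
        printf("internal log flags: %#x\n", log_write_flags(true, false));
        printf("external log flags: %#x\n", log_write_flags(true, true));
        return 0;
}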
*/ if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - XFS_BUF_ORDERED(bp); + else + bp->b_flags |= XBF_FLUSH; } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); @@ -1404,19 +1397,16 @@ xlog_sync(xlog_t *log, } if (split) { bp = iclog->ic_log->l_xbuf; - ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == - (unsigned long)1); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ (__psint_t)count), split); - XFS_BUF_SET_FSPRIVATE(bp, iclog); + bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); - bp->b_flags |= XBF_LOG_BUFFER; + bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) - XFS_BUF_ORDERED(bp); + bp->b_flags |= XBF_FUA; dptr = XFS_BUF_PTR(bp); /* * Bump the cycle numbers at the start of each block @@ -3521,13 +3511,13 @@ xlog_verify_iclog(xlog_t *log, spin_unlock(&log->l_icloglock); /* check log magic numbers */ - if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) + if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); ptr = (xfs_caddr_t) &iclog->ic_header; for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; ptr += BBSIZE) { - if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", __func__); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 04142caedb2b..8fe4206de057 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -91,6 +91,8 @@ xlog_get_bp( xlog_t *log, int nbblks) { + struct xfs_buf *bp; + if (!xlog_buf_bbcount_valid(log, nbblks)) { xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); @@ -118,8 +120,10 @@ xlog_get_bp( nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return xfs_buf_get_uncached(log->l_mp->m_logdev_targp, - BBTOB(nbblks), 0); + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0); + if (bp) + xfs_buf_unlock(bp); + return bp; } STATIC void @@ -264,7 +268,7 @@ xlog_bwrite( XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_HOLD(bp); - XFS_BUF_PSEMA(bp, PRIBIO); + xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); @@ -300,14 +304,14 @@ xlog_header_check_recover( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); /* * IRIX doesn't write the h_fmt field and leaves it zeroed * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. 
*/ - if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { + if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { xfs_warn(mp, "dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); @@ -333,7 +337,7 @@ xlog_header_check_mount( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); if (uuid_is_nil(&head->h_fs_uuid)) { /* @@ -367,7 +371,7 @@ xlog_recover_iodone( xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); } - XFS_BUF_CLR_IODONE_FUNC(bp); + bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -534,7 +538,7 @@ xlog_find_verify_log_record( head = (xlog_rec_header_t *)offset; - if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) + if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; if (!smallmem) @@ -916,7 +920,7 @@ xlog_find_tail( if (error) goto done; - if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { + if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 1; break; } @@ -933,8 +937,8 @@ xlog_find_tail( if (error) goto done; - if (XLOG_HEADER_MAGIC_NUM == - be32_to_cpu(*(__be32 *)offset)) { + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 2; break; } @@ -1947,7 +1951,7 @@ xfs_qm_dqcheck( * This is all fine; things are still consistent, and we haven't lost * any quota information. Just don't complain about bad dquot blks. */ - if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { if (flags & XFS_QMOPT_DOWARN) xfs_alert(mp, "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", @@ -2174,7 +2178,7 @@ xlog_recover_buffer_pass2( error = xfs_bwrite(mp, bp); } else { ASSERT(bp->b_target->bt_mount == mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; xfs_bdwrite(mp, bp); } @@ -2238,7 +2242,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { + if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", @@ -2434,7 +2438,7 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; xfs_bdwrite(mp, bp); error: if (need_free) @@ -2556,7 +2560,7 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; xfs_bdwrite(mp, bp); return (0); @@ -3295,7 +3299,7 @@ xlog_valid_rec_header( { int hlen; - if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { + if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { XFS_ERROR_REPORT("xlog_valid_rec_header(1)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b49b82363d20..7f25245da289 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -348,7 +348,7 @@ xfs_mount_validate_sb( } /* - * More sanity checking. These were stolen directly from + * More sanity checking. Most of these were stolen directly from * xfs_repair. 
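/*
 * [Editorial sketch] Worked example of the XFS_MAX_DBLOCKS()/
 * XFS_MIN_DBLOCKS() bounds folded into the superblock sanity check
 * below: sb_dblocks may not exceed agcount * agblocks (every AG full
 * size) and must at least cover all but the last AG plus a minimum-size
 * last AG. The geometry values here are arbitrary illustrations.
 */
#include <stdint.h>
#include <stdio.h>

#define MIN_AG_BLOCKS 64        /* stands in for XFS_MIN_AG_BLOCKS */

int main(void)
{
        uint64_t agcount = 4, agblocks = 1000;
        uint64_t max = agcount * agblocks;
        uint64_t min = (agcount - 1) * agblocks + MIN_AG_BLOCKS;

        printf("valid sb_dblocks range: [%llu, %llu]\n",
               (unsigned long long)min, (unsigned long long)max);
        return 0;
}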
*/ if (unlikely( @@ -371,23 +371,13 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { if (loud) - xfs_warn(mp, "SB sanity check 1 failed"); - return XFS_ERROR(EFSCORRUPTED); - } - - /* - * Sanity check AG count, size fields against data size field - */ - if (unlikely( - sbp->sb_dblocks == 0 || - sbp->sb_dblocks > - (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || - sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * - sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { - if (loud) - xfs_warn(mp, "SB sanity check 2 failed"); + XFS_CORRUPTION_ERROR("SB sanity check failed", + XFS_ERRLEVEL_LOW, mp, sbp); return XFS_ERROR(EFSCORRUPTED); } @@ -864,7 +854,8 @@ xfs_update_alignment(xfs_mount_t *mp) if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || (BBTOB(mp->m_swidth) & mp->m_blockmask)) { if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check 1 failed"); + xfs_warn(mp, "alignment check failed: " + "(sunit/swidth vs. blocksize)"); return XFS_ERROR(EINVAL); } mp->m_dalign = mp->m_swidth = 0; @@ -875,6 +866,8 @@ xfs_update_alignment(xfs_mount_t *mp) mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { if (mp->m_flags & XFS_MOUNT_RETERR) { + xfs_warn(mp, "alignment check failed: " + "(sunit/swidth vs. ag size)"); return XFS_ERROR(EINVAL); } xfs_warn(mp, @@ -889,8 +882,8 @@ xfs_update_alignment(xfs_mount_t *mp) mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, - "stripe alignment turned off: sunit(%d) less than bsize(%d)", + xfs_warn(mp, "alignment check failed: " + "sunit(%d) less than bsize(%d)", mp->m_dalign, mp->m_blockmask +1); return XFS_ERROR(EINVAL); @@ -1096,10 +1089,6 @@ xfs_mount_reset_sbqflags( if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; -#ifdef QUOTADEBUG - xfs_notice(mp, "Writing superblock quota changes"); -#endif - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, XFS_DEFAULT_LOG_COUNT); @@ -1532,7 +1521,7 @@ xfs_unmountfs( xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); - error = xfs_log_sbcount(mp, 1); + error = xfs_log_sbcount(mp); if (error) xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); @@ -1568,18 +1557,14 @@ xfs_fs_writable(xfs_mount_t *mp) /* * xfs_log_sbcount * - * Called either periodically to keep the on disk superblock values - * roughly up to date or from unmount to make sure the values are - * correct on a clean unmount. + * Sync the superblock counters to disk. * * Note this code can be called during the process of freezing, so - * we may need to use the transaction allocator which does not not + * we may need to use the transaction allocator which does not * block when the transaction subsystem is in its frozen state. 
*/ int -xfs_log_sbcount( - xfs_mount_t *mp, - uint sync) +xfs_log_sbcount(xfs_mount_t *mp) { xfs_trans_t *tp; int error; @@ -1605,8 +1590,7 @@ xfs_log_sbcount( } xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); - if (sync) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); return error; } @@ -1941,22 +1925,19 @@ unwind: * the superblock buffer if it can be locked without sleeping. * If it can't then we'll return NULL. */ -xfs_buf_t * +struct xfs_buf * xfs_getsb( - xfs_mount_t *mp, - int flags) + struct xfs_mount *mp, + int flags) { - xfs_buf_t *bp; + struct xfs_buf *bp = mp->m_sb_bp; - ASSERT(mp->m_sb_bp != NULL); - bp = mp->m_sb_bp; - if (flags & XBF_TRYLOCK) { - if (!XFS_BUF_CPSEMA(bp)) { + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) return NULL; - } - } else { - XFS_BUF_PSEMA(bp, PRIBIO); + xfs_buf_lock(bp); } + XFS_BUF_HOLD(bp); ASSERT(XFS_BUF_ISDONE(bp)); return bp; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 3d68bb267c5f..bb24dac42a25 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -371,7 +371,7 @@ typedef struct xfs_mod_sb { int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; -extern int xfs_log_sbcount(xfs_mount_t *, uint); +extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c83f63b33aae..efc147f0e9b6 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1426,6 +1426,7 @@ xfs_trans_committed( static inline void xfs_log_item_batch_insert( struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t commit_lsn) @@ -1434,7 +1435,7 @@ xfs_log_item_batch_insert( spin_lock(&ailp->xa_lock); /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ - xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); for (i = 0; i < nr_items; i++) IOP_UNPIN(log_items[i], 0); @@ -1452,6 +1453,13 @@ xfs_log_item_batch_insert( * as an iclog write error even though we haven't started any IO yet. Hence in * this case all we need to do is IOP_COMMITTED processing, followed by an * IOP_UNPIN(aborted) call. + * + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk + * the AIL to find the insertion point on every xfs_log_item_batch_insert() + * call. This saves a lot of needless list walking and is a net win, even + * though it slightly increases that amount of AIL lock traffic to set it up + * and tear it down. */ void xfs_trans_committed_bulk( @@ -1463,8 +1471,13 @@ xfs_trans_committed_bulk( #define LOG_ITEM_BATCH_SIZE 32 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; int i = 0; + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->xa_lock); + /* unpin all the log items */ for (lv = log_vector; lv; lv = lv->lv_next ) { struct xfs_log_item *lip = lv->lv_item; @@ -1493,7 +1506,9 @@ xfs_trans_committed_bulk( /* * Not a bulk update option due to unusual item_lsn. * Push into AIL immediately, rechecking the lsn once - * we have the ail lock. Then unpin the item. + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. 
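/*
 * [Editorial sketch] Shape of the bulk AIL insert used by
 * xfs_trans_committed_bulk() below: log items are staged in a fixed
 * array and flushed in batches of LOG_ITEM_BATCH_SIZE, with a single
 * insertion cursor set up at commit_lsn so each batch does not re-walk
 * the sorted AIL from its head. A user-space model of the batching part;
 * the names are illustrative.
 */
#include <stdio.h>

#define BATCH 32                /* mirrors LOG_ITEM_BATCH_SIZE */

static void batch_insert(const int *items, int n)
{
        printf("insert items %d..%d at the cached cursor\n",
               items[0], items[n - 1]);
}

int main(void)
{
        int staged[BATCH];
        int i = 0, item;

        for (item = 0; item < 70; item++) {     /* 70 pending log items */
                staged[i++] = item;
                if (i >= BATCH) {
                        batch_insert(staged, BATCH);
                        i = 0;
                }
        }
        if (i)                                  /* insert the remainder */
                batch_insert(staged, i);
        return 0;
}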
*/ spin_lock(&ailp->xa_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) @@ -1507,7 +1522,7 @@ xfs_trans_committed_bulk( /* Item is a candidate for bulk AIL insert. */ log_items[i++] = lv->lv_item; if (i >= LOG_ITEM_BATCH_SIZE) { - xfs_log_item_batch_insert(ailp, log_items, + xfs_log_item_batch_insert(ailp, &cur, log_items, LOG_ITEM_BATCH_SIZE, commit_lsn); i = 0; } @@ -1515,7 +1530,11 @@ xfs_trans_committed_bulk( /* make sure we insert the remainder! */ if (i) - xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); + xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 5fc2380092c8..43233e92f0f6 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -163,17 +163,11 @@ xfs_ail_max_lsn( } /* - * AIL traversal cursor initialisation. - * - * The cursor keeps track of where our current traversal is up - * to by tracking the next item in the list for us. However, for - * this to be safe, removing an object from the AIL needs to invalidate - * any cursor that points to it. hence the traversal cursor needs to - * be linked to the struct xfs_ail so that deletion can search all the - * active cursors for invalidation. - * - * We don't link the push cursor because it is embedded in the struct - * xfs_ail and hence easily findable. + * The cursor keeps track of where our current traversal is up to by tracking + * the next item in the list for us. However, for this to be safe, removing an + * object from the AIL needs to invalidate any cursor that points to it. hence + * the traversal cursor needs to be linked to the struct xfs_ail so that + * deletion can search all the active cursors for invalidation. */ STATIC void xfs_trans_ail_cursor_init( @@ -181,31 +175,12 @@ struct xfs_ail *ailp, struct xfs_ail_cursor *cur) { cur->item = NULL; - if (cur == &ailp->xa_cursors) - return; - - cur->next = ailp->xa_cursors.next; - ailp->xa_cursors.next = cur; -} - -/* - * Set the cursor to the next item, because when we look - * up the cursor the current item may have been freed. - */ -STATIC void -xfs_trans_ail_cursor_set( - struct xfs_ail *ailp, - struct xfs_ail_cursor *cur, - struct xfs_log_item *lip) -{ - if (lip) - cur->item = xfs_ail_next(ailp, lip); + list_add_tail(&cur->list, &ailp->xa_cursors); } /* - * Get the next item in the traversal and advance the cursor. - * If the cursor was invalidated (inidicated by a lip of 1), - * restart the traversal. + * Get the next item in the traversal and advance the cursor. If the cursor + * was invalidated (indicated by a lip of 1), restart the traversal. */ struct xfs_log_item * xfs_trans_ail_cursor_next( @@ -216,45 +191,31 @@ if ((__psint_t)lip & 1) lip = xfs_ail_min(ailp); - xfs_trans_ail_cursor_set(ailp, cur, lip); + if (lip) + cur->item = xfs_ail_next(ailp, lip); return lip; } /* - * Now that the traversal is complete, we need to remove the cursor - * from the list of traversing cursors. Avoid removing the embedded - * push cursor, but use the fact it is always present to make the - * list deletion simple. + * When the traversal is complete, we need to remove the cursor from the list + * of traversing cursors.
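+ *
+ * For illustration only (this roughly mirrors the xfs_ail_worker() push
+ * loop below), the expected cursor life cycle is:
+ *
+ *	spin_lock(&ailp->xa_lock);
+ *	lip = xfs_trans_ail_cursor_first(ailp, &cur, lsn);
+ *	while (lip != NULL) {
+ *		... operate on lip ...
+ *		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ *	}
+ *	xfs_trans_ail_cursor_done(ailp, &cur);
+ *	spin_unlock(&ailp->xa_lock);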
*/ void xfs_trans_ail_cursor_done( struct xfs_ail *ailp, - struct xfs_ail_cursor *done) + struct xfs_ail_cursor *cur) { - struct xfs_ail_cursor *prev = NULL; - struct xfs_ail_cursor *cur; - - done->item = NULL; - if (done == &ailp->xa_cursors) - return; - prev = &ailp->xa_cursors; - for (cur = prev->next; cur; prev = cur, cur = prev->next) { - if (cur == done) { - prev->next = cur->next; - break; - } - } - ASSERT(cur); + cur->item = NULL; + list_del_init(&cur->list); } /* - * Invalidate any cursor that is pointing to this item. This is - * called when an item is removed from the AIL. Any cursor pointing - * to this object is now invalid and the traversal needs to be - * terminated so it doesn't reference a freed object. We set the - * cursor item to a value of 1 so we can distinguish between an - * invalidation and the end of the list when getting the next item - * from the cursor. + * Invalidate any cursor that is pointing to this item. This is called when an + * item is removed from the AIL. Any cursor pointing to this object is now + * invalid and the traversal needs to be terminated so it doesn't reference a + * freed object. We set the low bit of the cursor item pointer so we can + * distinguish between an invalidation and the end of the list when getting the + * next item from the cursor. */ STATIC void xfs_trans_ail_cursor_clear( @@ -263,8 +224,7 @@ { struct xfs_ail_cursor *cur; - /* need to search all cursors */ - for (cur = &ailp->xa_cursors; cur; cur = cur->next) { + list_for_each_entry(cur, &ailp->xa_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) ((__psint_t)cur->item | 1); @@ -272,9 +232,10 @@ } /* - * Return the item in the AIL with the current lsn. - * Return the current tree generation number for use - * in calls to xfs_trans_next_ail(). + * Find the first item in the AIL with the given @lsn by searching in ascending + * LSN order and initialise the cursor to point to the next item for an + * ascending traversal. Pass a @lsn of zero to initialise the cursor to the + * first item in the AIL. Returns NULL if the list is empty. */ xfs_log_item_t * xfs_trans_ail_cursor_first( @@ -285,46 +246,112 @@ xfs_log_item_t *lip; xfs_trans_ail_cursor_init(ailp, cur); - lip = xfs_ail_min(ailp); - if (lsn == 0) + + if (lsn == 0) { + lip = xfs_ail_min(ailp); goto out; + } list_for_each_entry(lip, &ailp->xa_ail, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) goto out; } - lip = NULL; + return NULL; + out: - xfs_trans_ail_cursor_set(ailp, cur, lip); + if (lip) + cur->item = xfs_ail_next(ailp, lip); return lip; } +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} + +/* + * Find the last item in the AIL with the given @lsn by searching in descending + * LSN order and initialise the cursor to point to that item. If there is no + * item with the value of @lsn, then it sets the cursor to the last item with an + * LSN lower than @lsn. Returns NULL if the list is empty. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + /* - * splice the log item list into the AIL at the given LSN.
+ * Splice the log item list into the AIL at the given LSN. We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. */ static void xfs_ail_splice( - struct xfs_ail *ailp, - struct list_head *list, - xfs_lsn_t lsn) + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) { - xfs_log_item_t *next_lip; + struct xfs_log_item *lip = cur ? cur->item : NULL; + struct xfs_log_item *next_lip; - /* If the list is empty, just insert the item. */ - if (list_empty(&ailp->xa_ail)) { - list_splice(list, &ailp->xa_ail); - return; + /* + * Get a new cursor if we don't have a placeholder or the existing one + * has been invalidated. + */ + if (!lip || (__psint_t)lip & 1) { + lip = __xfs_trans_ail_cursor_last(ailp, lsn); + + if (!lip) { + /* The list is empty, so just splice and return. */ + if (cur) + cur->item = NULL; + list_splice(list, &ailp->xa_ail); + return; + } } - list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) - break; + /* + * Our cursor points to the item we want to insert _after_, so we have + * to update the cursor to point to the end of the list we are splicing + * in so that it points to the correct location for the next splice. + * i.e. before the splice + * + * lsn -> lsn -> lsn + x -> lsn + x ... + * ^ + * | cursor points here + * + * After the splice we have: + * + * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... + * ^ ^ + * | cursor points here | needs to move here + * + * So we set the cursor to the last item in the list to be spliced + * before we execute the splice, resulting in the cursor pointing to + * the correct item after the splice occurs. + */ + if (cur) { + next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); + cur->item = next_lip; } - - ASSERT(&next_lip->li_ail == &ailp->xa_ail || - XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0); - - list_splice_init(list, &next_lip->li_ail); + list_splice(list, &lip->li_ail); } /* @@ -351,7 +378,7 @@ xfs_ail_worker( struct xfs_ail *ailp = container_of(to_delayed_work(work), struct xfs_ail, xa_work); xfs_mount_t *mp = ailp->xa_mount; - struct xfs_ail_cursor *cur = &ailp->xa_cursors; + struct xfs_ail_cursor cur; xfs_log_item_t *lip; xfs_lsn_t lsn; xfs_lsn_t target; @@ -363,13 +390,12 @@ xfs_ail_worker( spin_lock(&ailp->xa_lock); target = ailp->xa_target; - xfs_trans_ail_cursor_init(ailp, cur); - lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn); + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { /* * AIL is empty or our push has reached the end. 
*/ - xfs_trans_ail_cursor_done(ailp, cur); + xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); goto out_done; } @@ -457,12 +483,12 @@ xfs_ail_worker( if (stuck > 100) break; - lip = xfs_trans_ail_cursor_next(ailp, cur); + lip = xfs_trans_ail_cursor_next(ailp, &cur); if (lip == NULL) break; lsn = lip->li_lsn; } - xfs_trans_ail_cursor_done(ailp, cur); + xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); if (flush_log) { @@ -645,6 +671,7 @@ xfs_trans_unlocked_item( void xfs_trans_ail_update_bulk( struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock) @@ -674,7 +701,7 @@ xfs_trans_ail_update_bulk( list_add(&lip->li_ail, &tmp); } - xfs_ail_splice(ailp, &tmp, lsn); + xfs_ail_splice(ailp, cur, &tmp, lsn); if (!mlip_changed) { spin_unlock(&ailp->xa_lock); @@ -793,6 +820,7 @@ xfs_trans_ail_init( ailp->xa_mount = mp; INIT_LIST_HEAD(&ailp->xa_ail); + INIT_LIST_HEAD(&ailp->xa_cursors); spin_lock_init(&ailp->xa_lock); INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); mp->m_ail = ailp; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 03b3b7f85a3b..15584fc3ed7d 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -81,7 +81,7 @@ _xfs_trans_bjoin( struct xfs_buf_log_item *bip; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + ASSERT(bp->b_transp == NULL); /* * The xfs_buf_log_item pointer is stored in b_fsprivate. If @@ -89,7 +89,7 @@ _xfs_trans_bjoin( * The checks to see if one is there are in xfs_buf_item_init(). */ xfs_buf_item_init(bp, tp->t_mountp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + bip = bp->b_fspriv; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); @@ -110,7 +110,7 @@ _xfs_trans_bjoin( * Initialize b_fsprivate2 so we can find it with incore_match() * in xfs_trans_get_buf() and friends above. */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); + bp->b_transp = tp; } @@ -160,7 +160,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, */ bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) XFS_BUF_SUPER_STALE(bp); @@ -172,8 +172,8 @@ xfs_trans_get_buf(xfs_trans_t *tp, else if (XFS_BUF_ISSTALE(bp)) ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; @@ -232,8 +232,8 @@ xfs_trans_getsb(xfs_trans_t *tp, * recursion count and return the buffer to the caller. 
*/ bp = mp->m_sb_bp; - if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) { - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + if (bp->b_transp == tp) { + bip = bp->b_fspriv; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; @@ -327,9 +327,9 @@ xfs_trans_read_buf( */ bp = xfs_trans_buf_item_match(tp, target, blkno, len); if (bp != NULL) { - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_transp == tp); + ASSERT(bp->b_fspriv != NULL); ASSERT((XFS_BUF_ISERROR(bp)) == 0); if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); @@ -363,7 +363,7 @@ xfs_trans_read_buf( } - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + bip = bp->b_fspriv; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -460,32 +460,30 @@ xfs_trans_brelse(xfs_trans_t *tp, xfs_buf_t *bp) { xfs_buf_log_item_t *bip; - xfs_log_item_t *lip; /* * Default to a normal brelse() call if the tp is NULL. */ if (tp == NULL) { - ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + struct xfs_log_item *lip = bp->b_fspriv; + + ASSERT(bp->b_transp == NULL); + /* * If there's a buf log item attached to the buffer, * then let the AIL know that the buffer is being * unlocked. */ - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - if (lip->li_type == XFS_LI_BUF) { - bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); - xfs_trans_unlocked_item(bip->bli_item.li_ailp, - lip); - } + if (lip != NULL && lip->li_type == XFS_LI_BUF) { + bip = bp->b_fspriv; + xfs_trans_unlocked_item(bip->bli_item.li_ailp, lip); } xfs_buf_relse(bp); return; } - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); @@ -556,7 +554,7 @@ xfs_trans_brelse(xfs_trans_t *tp, xfs_buf_item_relse(bp); bip = NULL; } - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + bp->b_transp = NULL; /* * If we've still got a buf log item on the buffer, then @@ -581,16 +579,15 @@ void xfs_trans_bhold(xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); } @@ -603,19 +600,17 @@ void xfs_trans_bhold_release(xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); - bip->bli_flags &= ~XFS_BLI_HOLD; + bip->bli_flags &= ~XFS_BLI_HOLD; trace_xfs_trans_bhold_release(bip); } @@ -634,14 
+629,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, uint first, uint last) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); - ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) || - (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks)); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); /* * Mark the buffer as needing to be written out eventually, @@ -656,9 +651,8 @@ xfs_trans_log_buf(xfs_trans_t *tp, XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); + bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; trace_xfs_trans_log_buf(bip); @@ -706,13 +700,11 @@ xfs_trans_binval( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_binval(bip); @@ -780,13 +772,11 @@ xfs_trans_inode_buf( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; @@ -806,13 +796,11 @@ xfs_trans_stale_inode_buf( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; @@ -833,13 +821,11 @@ xfs_trans_inode_alloc_buf( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; @@ -863,16 +849,14 @@ xfs_trans_dquot_buf( xfs_buf_t *bp, uint type) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_format.blf_flags |= type; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 048b0c689d3e..c8dea2fd7e68 100644 --- 
a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -55,7 +55,6 @@ xfs_trans_ijoin( { xfs_inode_log_item_t *iip; - ASSERT(ip->i_transp == NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); @@ -68,12 +67,6 @@ xfs_trans_ijoin( xfs_trans_add_item(tp, &iip->ili_item); xfs_trans_inode_broot_debug(ip); - - /* - * Initialize i_transp so we can find it with xfs_inode_incore() - * in xfs_trans_iget() above. - */ - ip->i_transp = tp; } /* @@ -111,7 +104,6 @@ xfs_trans_ichgtime( ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_transp == tp); tv = current_fs_time(inode->i_sb); @@ -140,7 +132,6 @@ xfs_trans_log_inode( xfs_inode_t *ip, uint flags) { - ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 6b164e9e9a1f..212946b97239 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -53,7 +53,7 @@ void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, * of the list to trigger traversal restarts. */ struct xfs_ail_cursor { - struct xfs_ail_cursor *next; + struct list_head list; struct xfs_log_item *item; }; @@ -66,7 +66,7 @@ struct xfs_ail { struct xfs_mount *xa_mount; struct list_head xa_ail; xfs_lsn_t xa_target; - struct xfs_ail_cursor xa_cursors; + struct list_head xa_cursors; spinlock_t xa_lock; struct delayed_work xa_work; xfs_lsn_t xa_last_pushed_lsn; @@ -82,6 +82,7 @@ struct xfs_ail { extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock); static inline void @@ -90,7 +91,7 @@ xfs_trans_ail_update( struct xfs_log_item *lip, xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, @@ -111,10 +112,13 @@ xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); -struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn); -struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp, +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 619720705bc6..88d121486c52 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -50,430 +50,6 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" -int -xfs_setattr( - struct xfs_inode *ip, - struct iattr *iattr, - int flags) -{ - xfs_mount_t *mp = ip->i_mount; - struct inode *inode = VFS_I(ip); - int mask = iattr->ia_valid; - xfs_trans_t *tp; - int code; - uint lock_flags; - uint commit_flags=0; - uid_t uid=0, iuid=0; - gid_t gid=0, igid=0; - struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; - int need_iolock = 1; - - trace_xfs_setattr(ip); - - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - code = -inode_change_ok(inode, 
iattr); - if (code) - return code; - - olddquot1 = olddquot2 = NULL; - udqp = gdqp = NULL; - - /* - * If disk quotas is on, we make sure that the dquots do exist on disk, - * before we start any other transactions. Trying to do this later - * is messy. We don't care to take a readlock to look at the ids - * in inode here, because we can't hold it across the trans_reserve. - * If the IDs do change before we take the ilock, we're covered - * because the i_*dquot fields will get updated anyway. - */ - if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { - uint qflags = 0; - - if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; - qflags |= XFS_QMOPT_UQUOTA; - } else { - uid = ip->i_d.di_uid; - } - if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; - qflags |= XFS_QMOPT_GQUOTA; - } else { - gid = ip->i_d.di_gid; - } - - /* - * We take a reference when we initialize udqp and gdqp, - * so it is important that we never blindly double trip on - * the same variable. See xfs_create() for an example. - */ - ASSERT(udqp == NULL); - ASSERT(gdqp == NULL); - code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), - qflags, &udqp, &gdqp); - if (code) - return code; - } - - /* - * For the other attributes, we acquire the inode lock and - * first do an error checking pass. - */ - tp = NULL; - lock_flags = XFS_ILOCK_EXCL; - if (flags & XFS_ATTR_NOLOCK) - need_iolock = 0; - if (!(mask & ATTR_SIZE)) { - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - commit_flags = 0; - code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), - 0, 0, 0); - if (code) { - lock_flags = 0; - goto error_return; - } - } else { - if (need_iolock) - lock_flags |= XFS_IOLOCK_EXCL; - } - - xfs_ilock(ip, lock_flags); - - /* - * Change file ownership. Must be the owner or privileged. - */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * These IDs could have changed since we last looked at them. - * But, we're assured that if the ownership did change - * while we didn't have the inode locked, inode's dquot(s) - * would have changed also. - */ - iuid = ip->i_d.di_uid; - igid = ip->i_d.di_gid; - gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; - uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - - /* - * Do a quota reservation only if uid/gid is actually - * going to change. - */ - if (XFS_IS_QUOTA_RUNNING(mp) && - ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || - (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { - ASSERT(tp); - code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, - capable(CAP_FOWNER) ? - XFS_QMOPT_FORCE_RES : 0); - if (code) /* out of quota */ - goto error_return; - } - } - - /* - * Truncate file. Must have write permission and not be a directory. - */ - if (mask & ATTR_SIZE) { - /* Short circuit the truncate case for zero length files */ - if (iattr->ia_size == 0 && - ip->i_size == 0 && ip->i_d.di_nextents == 0) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; - if (mask & ATTR_CTIME) { - inode->i_mtime = inode->i_ctime = - current_fs_time(inode->i_sb); - xfs_mark_inode_dirty_sync(ip); - } - code = 0; - goto error_return; - } - - if (S_ISDIR(ip->i_d.di_mode)) { - code = XFS_ERROR(EISDIR); - goto error_return; - } else if (!S_ISREG(ip->i_d.di_mode)) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - - /* - * Make sure that the dquots are attached to the inode. - */ - code = xfs_qm_dqattach_locked(ip, 0); - if (code) - goto error_return; - - /* - * Now we can make the changes. 
Before we join the inode - * to the transaction, if ATTR_SIZE is set then take care of - * the part of the truncation that must be done without the - * inode lock. This needs to be done before joining the inode - * to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. - */ - if (iattr->ia_size > ip->i_size) { - /* - * Do the first part of growing a file: zero any data - * in the last block that is beyond the old EOF. We - * need to do this before the inode is joined to the - * transaction to modify the i_size. - */ - code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); - if (code) - goto error_return; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; - - /* - * We are going to log the inode size change in this - * transaction so any previous writes that are beyond the on - * disk EOF and the new EOF that have not been written out need - * to be written here. If we do not write the data out, we - * expose ourselves to the null files problem. - * - * Only flush from the on disk size to the smaller of the in - * memory file size or the new size as that's the range we - * really care about here and prevents waiting for other data - * not within the range we care about here. - */ - if (ip->i_size != ip->i_d.di_size && - iattr->ia_size > ip->i_d.di_size) { - code = xfs_flush_pages(ip, - ip->i_d.di_size, iattr->ia_size, - XBF_ASYNC, FI_NONE); - if (code) - goto error_return; - } - - /* wait for all I/O to complete */ - xfs_ioend_wait(ip); - - code = -block_truncate_page(inode->i_mapping, iattr->ia_size, - xfs_get_blocks); - if (code) - goto error_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (code) - goto error_return; - - truncate_setsize(inode, iattr->ia_size); - - commit_flags = XFS_TRANS_RELEASE_LOG_RES; - lock_flags |= XFS_ILOCK_EXCL; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); - - /* - * Only change the c/mtime if we are changing the size - * or we are explicitly asked to change it. This handles - * the semantic difference between truncate() and ftruncate() - * as implemented in the VFS. - * - * The regular truncate() case without ATTR_CTIME and ATTR_MTIME - * is a special case where we need to update the times despite - * not having these flags set. For all other operations the - * VFS set these flags explicitly if it wants a timestamp - * update. - */ - if (iattr->ia_size != ip->i_size && - (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { - iattr->ia_ctime = iattr->ia_mtime = - current_fs_time(inode->i_sb); - mask |= ATTR_CTIME | ATTR_MTIME; - } - - if (iattr->ia_size > ip->i_size) { - ip->i_d.di_size = iattr->ia_size; - ip->i_size = iattr->ia_size; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } else if (iattr->ia_size <= ip->i_size || - (iattr->ia_size == 0 && ip->i_d.di_nextents)) { - /* - * signal a sync transaction unless - * we're truncating an already unlinked - * file on a wsync filesystem - */ - code = xfs_itruncate_finish(&tp, ip, iattr->ia_size, - XFS_DATA_FORK, - ((ip->i_d.di_nlink != 0 || - !(mp->m_flags & XFS_MOUNT_WSYNC)) - ? 1 : 0)); - if (code) - goto abort_return; - /* - * Truncated "down", so we're removing references - * to old data here - if we now delay flushing for - * a long time, we expose ourselves unduly to the - * notorious NULL files problem. 
So, we mark this - * vnode and flush it when the file is closed, and - * do not wait the usual (long) time for writeout. - */ - xfs_iflags_set(ip, XFS_ITRUNCATED); - } - } else if (tp) { - xfs_trans_ijoin(tp, ip); - } - - /* - * Change file ownership. Must be the owner or privileged. - */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) { - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); - } - - /* - * Change the ownerships and register quota modifications - * in the transaction. - */ - if (iuid != uid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & ATTR_UID); - ASSERT(udqp); - olddquot1 = xfs_qm_vop_chown(tp, ip, - &ip->i_udquot, udqp); - } - ip->i_d.di_uid = uid; - inode->i_uid = uid; - } - if (igid != gid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & ATTR_GID); - ASSERT(gdqp); - olddquot2 = xfs_qm_vop_chown(tp, ip, - &ip->i_gdquot, gdqp); - } - ip->i_d.di_gid = gid; - inode->i_gid = gid; - } - } - - /* - * Change file access modes. - */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } - - /* - * Change file access or modified times. - */ - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - ip->i_update_core = 1; - } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - ip->i_update_core = 1; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - ip->i_update_core = 1; - } - - /* - * And finally, log the inode core if any attribute in it - * has been changed. - */ - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| - ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - XFS_STATS_INC(xs_ig_attrchg); - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - * This is slightly sub-optimal in that truncates require - * two sync transactions instead of one for wsync filesystems. - * One for the truncate and one for the timestamps since we - * don't want to change the timestamps unless we're sure the - * truncate worked. Truncates are less than 1% of the laddis - * mix so this probably isn't worth the trouble to optimize. - */ - code = 0; - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); - - code = xfs_trans_commit(tp, commit_flags); - - xfs_iunlock(ip, lock_flags); - - /* - * Release any dquot(s) the inode had kept before chown. - */ - xfs_qm_dqrele(olddquot1); - xfs_qm_dqrele(olddquot2); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - if (code) - return code; - - /* - * XXX(hch): Updating the ACL entries is not atomic vs the i_mode - * update. We could avoid this with linked transactions - * and passing down the transaction pointer all the way - * to attr_set. 
No previous user of the generic - * Posix ACL code seems to care about this issue either. - */ - if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - code = -xfs_acl_chmod(inode); - if (code) - return XFS_ERROR(code); - } - - return 0; - - abort_return: - commit_flags |= XFS_TRANS_ABORT; - error_return: - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - if (tp) { - xfs_trans_cancel(tp, commit_flags); - } - if (lock_flags != 0) { - xfs_iunlock(ip, lock_flags); - } - return code; -} - /* * The maximum pathlen is 1024 bytes. Since the minimum file system * blocksize is 512 bytes, we can get a max of 2 extents back from @@ -621,13 +197,6 @@ xfs_free_eofblocks( */ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - /* - * Do the xfs_itruncate_start() call before - * reserving any log space because - * itruncate_start will call into the buffer - * cache and we can't - * do that within a transaction. - */ if (flags & XFS_FREE_EOF_TRYLOCK) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); @@ -636,13 +205,6 @@ xfs_free_eofblocks( } else { xfs_ilock(ip, XFS_IOLOCK_EXCL); } - error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, - ip->i_size); - if (error) { - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), @@ -658,15 +220,12 @@ xfs_free_eofblocks( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip); - error = xfs_itruncate_finish(&tp, ip, - ip->i_size, - XFS_DATA_FORK, - 0); - /* - * If we get an error at this point we - * simply don't bother truncating the file. - */ + error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { + /* + * If we get an error at this point we simply don't + * bother truncating the file. + */ xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); @@ -1084,20 +643,9 @@ xfs_inactive( tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); if (truncate) { - /* - * Do the xfs_itruncate_start() call before - * reserving any log space because itruncate_start - * will call into the buffer cache and we can't - * do that within a transaction. - */ xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); - if (error) { - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return VN_INACTIVE_CACHE; - } + xfs_ioend_wait(ip); error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), @@ -1114,16 +662,7 @@ xfs_inactive( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip); - /* - * normally, we have to run xfs_itruncate_finish sync. - * But if filesystem is wsync and we're in the inactive - * path, then we know that nlink == 0, and that the - * xaction that made nlink == 0 is permanently committed - * since xfs_remove runs as a synchronous transaction. - */ - error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, - (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 
1 : 0)); - + error = xfs_itruncate_data(&tp, ip, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); @@ -2430,6 +1969,8 @@ xfs_zero_remaining_bytes( if (!bp) return XFS_ERROR(ENOMEM); + xfs_buf_unlock(bp); + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; @@ -2784,7 +2325,7 @@ xfs_change_file_space( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = startoffset; - error = xfs_setattr(ip, &iattr, attr_flags); + error = xfs_setattr_size(ip, &iattr, attr_flags); if (error) return error; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 3bcd23353d6c..35d3d513e1e9 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -13,7 +13,8 @@ struct xfs_inode; struct xfs_iomap; -int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); +int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); +int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags); #define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 2fb1dcbcb5aa..9962c6bb1311 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -59,8 +59,6 @@ struct lguest_data { unsigned long reserve_mem; /* KHz for the TSC clock. */ u32 tsc_khz; - /* Page where the top-level pagetable is */ - unsigned long pgdir; /* Fields initialized by the Guest at boot: */ /* Instruction range to suppress interrupts even if enabled */ diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 5a90266c3a5a..0dc98044d8b7 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -68,6 +68,11 @@ * controller and report the event to the driver. */ #define TMIO_MMC_HAS_COLD_CD (1 << 3) +/* + * Some controllers require waiting for the SD bus to become + * idle before writing to some registers. 
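+ *
+ * A platform can satisfy this with the write16_hook in tmio_mmc_data.
+ * Hypothetical sketch only -- IDLE_BIT is an assumed busy/idle bit in
+ * CTL_STATUS2, and sd_ctrl_read16() stands for the driver's 16-bit
+ * register read helper:
+ *
+ *	static int example_write16_hook(struct tmio_mmc_host *host, int addr)
+ *	{
+ *		int timeout = 1000;
+ *
+ *		switch (addr) {
+ *		case CTL_SD_CMD:
+ *		case CTL_SD_XFER_LEN:
+ *			while (!(sd_ctrl_read16(host, CTL_STATUS2) & IDLE_BIT)) {
+ *				if (!--timeout)
+ *					return -EBUSY;
+ *				udelay(1);
+ *			}
+ *		}
+ *		return 0;
+ *	}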
+ */ +#define TMIO_MMC_HAS_IDLE_WAIT (1 << 4) int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); @@ -80,6 +85,8 @@ struct tmio_mmc_dma { int alignment_shift; }; +struct tmio_mmc_host; + /* * data for the MMC controller */ @@ -94,6 +101,7 @@ struct tmio_mmc_data { void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); int (*get_cd)(struct platform_device *host); + int (*write16_hook)(struct tmio_mmc_host *host, int addr); }; static inline void tmio_mmc_cd_wakeup(struct tmio_mmc_data *pdata) diff --git a/include/linux/mmc/boot.h b/include/linux/mmc/boot.h index 39d787c229cb..23acc3baa07d 100644 --- a/include/linux/mmc/boot.h +++ b/include/linux/mmc/boot.h @@ -1,7 +1,7 @@ -#ifndef MMC_BOOT_H -#define MMC_BOOT_H +#ifndef LINUX_MMC_BOOT_H +#define LINUX_MMC_BOOT_H enum { MMC_PROGRESS_ENTER, MMC_PROGRESS_INIT, MMC_PROGRESS_LOAD, MMC_PROGRESS_DONE }; -#endif +#endif /* LINUX_MMC_BOOT_H */ diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 6ad43554ac05..b460fc2af8a1 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -403,4 +403,4 @@ extern void mmc_unregister_driver(struct mmc_driver *); extern void mmc_fixup_device(struct mmc_card *card, const struct mmc_fixup *table); -#endif +#endif /* LINUX_MMC_CARD_H */ diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index b6718e549a51..b8b1b7a311f1 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -117,6 +117,7 @@ struct mmc_data { unsigned int sg_len; /* size of scatter list */ struct scatterlist *sg; /* I/O scatter list */ + s32 host_cookie; /* host private data */ }; struct mmc_request { @@ -125,13 +126,16 @@ struct mmc_request { struct mmc_data *data; struct mmc_command *stop; - void *done_data; /* completion data */ + struct completion completion; void (*done)(struct mmc_request *);/* completion function */ }; struct mmc_host; struct mmc_card; +struct mmc_async_req; +extern struct mmc_async_req *mmc_start_req(struct mmc_host *, + struct mmc_async_req *, int *); extern void mmc_wait_for_req(struct mmc_host *, struct mmc_request *); extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int); extern int mmc_app_cmd(struct mmc_host *, struct mmc_card *); @@ -155,6 +159,7 @@ extern int mmc_can_trim(struct mmc_card *card); extern int mmc_can_secure_erase_trim(struct mmc_card *card); extern int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from, unsigned int nr); +extern unsigned int mmc_calc_max_discard(struct mmc_card *card); extern int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen); @@ -179,4 +184,4 @@ static inline void mmc_claim_host(struct mmc_host *host) extern u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max); -#endif +#endif /* LINUX_MMC_CORE_H */ diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index bdd7ceeb99e4..6b46819705d1 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -11,8 +11,8 @@ * (at your option) any later version. */ -#ifndef _LINUX_MMC_DW_MMC_H_ -#define _LINUX_MMC_DW_MMC_H_ +#ifndef LINUX_MMC_DW_MMC_H +#define LINUX_MMC_DW_MMC_H #define MAX_MCI_SLOTS 2 @@ -48,6 +48,7 @@ struct mmc_data; * @data: The data currently being transferred, or NULL if no data * transfer is in progress. * @use_dma: Whether DMA channel is initialized or not. + * @using_dma: Whether DMA is in use for the current transfer. 
* @sg_dma: Bus address of DMA buffer. * @sg_cpu: Virtual address of DMA buffer. * @dma_ops: Pointer to platform-specific DMA callbacks. @@ -74,7 +75,11 @@ struct mmc_data; * @pdev: Platform device associated with the MMC controller. * @pdata: Platform data associated with the MMC controller. * @slot: Slots sharing this MMC controller. + * @fifo_depth: depth of FIFO. * @data_shift: log2 of FIFO item size. + * @part_buf_start: Start index in part_buf. + * @part_buf_count: Bytes of partial data in part_buf. + * @part_buf: Simple buffer for partial fifo reads/writes. * @push_data: Pointer to FIFO push function. * @pull_data: Pointer to FIFO pull function. * @quirks: Set of quirks that apply to specific versions of the IP. @@ -117,6 +122,7 @@ struct dw_mci { /* DMA interface members*/ int use_dma; + int using_dma; dma_addr_t sg_dma; void *sg_cpu; @@ -131,7 +137,7 @@ struct dw_mci { u32 stop_cmdr; u32 dir_status; struct tasklet_struct tasklet; - struct tasklet_struct card_tasklet; + struct work_struct card_work; unsigned long pending_events; unsigned long completed_events; enum dw_mci_state state; @@ -146,7 +152,15 @@ struct dw_mci { struct dw_mci_slot *slot[MAX_MCI_SLOTS]; /* FIFO push and pull */ + int fifo_depth; int data_shift; + u8 part_buf_start; + u8 part_buf_count; + union { + u16 part_buf16; + u32 part_buf32; + u64 part_buf; + }; void (*push_data)(struct dw_mci *host, void *buf, int cnt); void (*pull_data)(struct dw_mci *host, void *buf, int cnt); @@ -196,6 +210,12 @@ struct dw_mci_board { unsigned int bus_hz; /* Bus speed */ unsigned int caps; /* Capabilities */ + /* + * Override fifo depth. If 0, autodetect it from the FIFOTH register, + * but note that this may not be reliable after a bootloader has used + * it. + */ + unsigned int fifo_depth; /* delay in mS before detecting cards after interrupt */ u32 detect_delay_ms; @@ -219,4 +239,4 @@ struct dw_mci_board { struct block_settings *blk_settings; }; -#endif /* _LINUX_MMC_DW_MMC_H_ */ +#endif /* LINUX_MMC_DW_MMC_H */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 1ee4424462eb..0f83858147a6 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -106,6 +106,15 @@ struct mmc_host_ops { */ int (*enable)(struct mmc_host *host); int (*disable)(struct mmc_host *host, int lazy); + /* + * It is optional for the host to implement pre_req and post_req in + * order to support double buffering of requests (prepare one + * request while another request is active). + */ + void (*post_req)(struct mmc_host *host, struct mmc_request *req, + int err); + void (*pre_req)(struct mmc_host *host, struct mmc_request *req, + bool is_first_req); void (*request)(struct mmc_host *host, struct mmc_request *req); /* * Avoid calling these three functions too often or in a "fast path", @@ -139,11 +148,22 @@ struct mmc_host_ops { int (*start_signal_voltage_switch)(struct mmc_host *host, struct mmc_ios *ios); int (*execute_tuning)(struct mmc_host *host); void (*enable_preset_value)(struct mmc_host *host, bool enable); + int (*select_drive_strength)(unsigned int max_dtr, int host_drv, int card_drv); }; struct mmc_card; struct device; +struct mmc_async_req { + /* active mmc request */ + struct mmc_request *mrq; + /* + * Check error status of completed mmc request. + * Returns 0 on success, otherwise non-zero.
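+ *
+ * Minimal sketch of an implementation (illustrative only, not from this
+ * patch), simply propagating the command error:
+ *
+ *	static int example_err_check(struct mmc_card *card,
+ *				     struct mmc_async_req *areq)
+ *	{
+ *		return areq->mrq->cmd->error ? -EIO : 0;
+ *	}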
+ */ + int (*err_check) (struct mmc_card *, struct mmc_async_req *); +}; + struct mmc_host { struct device *parent; struct device class_dev; @@ -231,6 +251,7 @@ struct mmc_host { unsigned int max_req_size; /* maximum number of bytes in one req */ unsigned int max_blk_size; /* maximum size of one mmc block */ unsigned int max_blk_count; /* maximum number of blocks in one req */ + unsigned int max_discard_to; /* max. discard timeout in ms */ /* private data */ spinlock_t lock; /* lock for claim and bus ops */ @@ -281,6 +302,8 @@ struct mmc_host { struct dentry *debugfs_root; + struct mmc_async_req *areq; /* active async req */ + unsigned long private[0] ____cacheline_aligned; }; @@ -373,5 +396,4 @@ static inline int mmc_host_cmd23(struct mmc_host *host) { return host->caps & MMC_CAP_CMD23; } -#endif - +#endif /* LINUX_MMC_HOST_H */ diff --git a/include/linux/mmc/ioctl.h b/include/linux/mmc/ioctl.h index 5baf2983a12f..8fa5bc5f8059 100644 --- a/include/linux/mmc/ioctl.h +++ b/include/linux/mmc/ioctl.h @@ -51,4 +51,4 @@ struct mmc_ioc_cmd { * block device operations. */ #define MMC_IOC_MAX_BYTES (512L * 256) -#endif /* LINUX_MMC_IOCTL_H */ +#endif /* LINUX_MMC_IOCTL_H */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index ac26a685cca8..5a794cb503ea 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -21,8 +21,8 @@ * 15 May 2002 */ -#ifndef MMC_MMC_H -#define MMC_MMC_H +#ifndef LINUX_MMC_MMC_H +#define LINUX_MMC_MMC_H /* Standard MMC commands (4.1) type argument response */ /* class 1 */ @@ -140,6 +140,16 @@ static inline bool mmc_op_multi(u32 opcode) #define R1_SWITCH_ERROR (1 << 7) /* sx, c */ #define R1_APP_CMD (1 << 5) /* sr, c */ +#define R1_STATE_IDLE 0 +#define R1_STATE_READY 1 +#define R1_STATE_IDENT 2 +#define R1_STATE_STBY 3 +#define R1_STATE_TRAN 4 +#define R1_STATE_DATA 5 +#define R1_STATE_RCV 6 +#define R1_STATE_PRG 7 +#define R1_STATE_DIS 8 + /* * MMC/SD in SPI mode reports R1 status always, and R2 for SEND_STATUS * R1 is the low order byte; R2 is the next highest byte, when present. @@ -327,5 +337,4 @@ struct _mmc_csd { #define MMC_SWITCH_MODE_CLEAR_BITS 0x02 /* Clear bits which are 1 in value */ #define MMC_SWITCH_MODE_WRITE_BYTE 0x03 /* Set target to value */ -#endif /* MMC_MMC_PROTOCOL_H */ - +#endif /* LINUX_MMC_MMC_H */ diff --git a/include/linux/mmc/pm.h b/include/linux/mmc/pm.h index d37aac49cf9a..4a139204c20c 100644 --- a/include/linux/mmc/pm.h +++ b/include/linux/mmc/pm.h @@ -27,4 +27,4 @@ typedef unsigned int mmc_pm_flag_t; #define MMC_PM_KEEP_POWER (1 << 0) /* preserve card power during suspend */ #define MMC_PM_WAKE_SDIO_IRQ (1 << 1) /* wake up host system on SDIO IRQ assertion */ -#endif +#endif /* LINUX_MMC_PM_H */ diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h index 7d35d52c3df3..1ebcf9ba1256 100644 --- a/include/linux/mmc/sd.h +++ b/include/linux/mmc/sd.h @@ -9,8 +9,8 @@ * your option) any later version. */ -#ifndef MMC_SD_H -#define MMC_SD_H +#ifndef LINUX_MMC_SD_H +#define LINUX_MMC_SD_H /* SD commands type argument response */ /* class 0 */ @@ -91,5 +91,4 @@ #define SD_SWITCH_ACCESS_DEF 0 #define SD_SWITCH_ACCESS_HS 1 -#endif - +#endif /* LINUX_MMC_SD_H */ diff --git a/include/linux/mmc/sdhci-pltfm.h b/include/linux/mmc/sdhci-pltfm.h deleted file mode 100644 index 548d59d404cb..000000000000 --- a/include/linux/mmc/sdhci-pltfm.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Platform data declarations for the sdhci-pltfm driver. - * - * Copyright (c) 2010 MontaVista Software, LLC. 
- * - * Author: Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - */ - -#ifndef _SDHCI_PLTFM_H -#define _SDHCI_PLTFM_H - -struct sdhci_ops; -struct sdhci_host; - -/** - * struct sdhci_pltfm_data - SDHCI platform-specific information & hooks - * @ops: optional pointer to the platform-provided SDHCI ops - * @quirks: optional SDHCI quirks - * @init: optional hook that is called during device probe, before the - * driver tries to access any SDHCI registers - * @exit: optional hook that is called during device removal - */ -struct sdhci_pltfm_data { - struct sdhci_ops *ops; - unsigned int quirks; - int (*init)(struct sdhci_host *host, struct sdhci_pltfm_data *pdata); - void (*exit)(struct sdhci_host *host); -}; - -#endif /* _SDHCI_PLTFM_H */ diff --git a/include/linux/mmc/sdhci-spear.h b/include/linux/mmc/sdhci-spear.h index 9188c973f3e1..5cdc96da9dd5 100644 --- a/include/linux/mmc/sdhci-spear.h +++ b/include/linux/mmc/sdhci-spear.h @@ -11,8 +11,8 @@ * warranty of any kind, whether express or implied. */ -#ifndef MMC_SDHCI_SPEAR_H -#define MMC_SDHCI_SPEAR_H +#ifndef LINUX_MMC_SDHCI_SPEAR_H +#define LINUX_MMC_SDHCI_SPEAR_H #include /* @@ -39,4 +39,4 @@ sdhci_set_plat_data(struct platform_device *pdev, struct sdhci_plat_data *data) pdev->dev.platform_data = data; } -#endif /* MMC_SDHCI_SPEAR_H */ +#endif /* LINUX_MMC_SDHCI_SPEAR_H */ diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 6a68c4eb4e44..5666f3abfab7 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -8,8 +8,8 @@ * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. */ -#ifndef __SDHCI_H -#define __SDHCI_H +#ifndef LINUX_MMC_SDHCI_H +#define LINUX_MMC_SDHCI_H #include #include @@ -162,4 +162,4 @@ struct sdhci_host { unsigned long private[0] ____cacheline_aligned; }; -#endif /* __SDHCI_H */ +#endif /* LINUX_MMC_SDHCI_H */ diff --git a/include/linux/mmc/sdio.h b/include/linux/mmc/sdio.h index 245cdacee544..2a2e9905a247 100644 --- a/include/linux/mmc/sdio.h +++ b/include/linux/mmc/sdio.h @@ -9,8 +9,8 @@ * your option) any later version. */ -#ifndef MMC_SDIO_H -#define MMC_SDIO_H +#ifndef LINUX_MMC_SDIO_H +#define LINUX_MMC_SDIO_H /* SDIO commands type argument response */ #define SD_IO_SEND_OP_COND 5 /* bcr [23:0] OCR R4 */ @@ -161,5 +161,4 @@ #define SDIO_FBR_BLKSIZE 0x10 /* block size (2 bytes) */ -#endif - +#endif /* LINUX_MMC_SDIO_H */ diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index 31baaf82f458..50f0bc952328 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -9,8 +9,8 @@ * your option) any later version. 
*/ -#ifndef MMC_SDIO_FUNC_H -#define MMC_SDIO_FUNC_H +#ifndef LINUX_MMC_SDIO_FUNC_H +#define LINUX_MMC_SDIO_FUNC_H #include #include @@ -161,5 +161,4 @@ extern void sdio_f0_writeb(struct sdio_func *func, unsigned char b, extern mmc_pm_flag_t sdio_get_host_pm_caps(struct sdio_func *func); extern int sdio_set_host_pm_flags(struct sdio_func *func, mmc_pm_flag_t flags); -#endif - +#endif /* LINUX_MMC_SDIO_FUNC_H */ diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index a36ab3bc7b03..9f03feedc8e7 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -2,8 +2,8 @@ * SDIO Classes, Interface Types, Manufacturer IDs, etc. */ -#ifndef MMC_SDIO_IDS_H -#define MMC_SDIO_IDS_H +#ifndef LINUX_MMC_SDIO_IDS_H +#define LINUX_MMC_SDIO_IDS_H /* * Standard SDIO Function Interfaces @@ -44,4 +44,4 @@ #define SDIO_DEVICE_ID_SIANO_NOVA_A0 0x1100 #define SDIO_DEVICE_ID_SIANO_STELLAR 0x5347 -#endif +#endif /* LINUX_MMC_SDIO_IDS_H */ diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h index 9eb9b4b96f55..0222cd8ebe76 100644 --- a/include/linux/mmc/sh_mmcif.h +++ b/include/linux/mmc/sh_mmcif.h @@ -11,8 +11,8 @@ * */ -#ifndef __SH_MMCIF_H__ -#define __SH_MMCIF_H__ +#ifndef LINUX_MMC_SH_MMCIF_H +#define LINUX_MMC_SH_MMCIF_H #include #include @@ -220,4 +220,4 @@ static inline void sh_mmcif_boot_init(void __iomem *base) sh_mmcif_boot_cmd(base, 0x03400040, 0x00010000); } -#endif /* __SH_MMCIF_H__ */ +#endif /* LINUX_MMC_SH_MMCIF_H */ diff --git a/include/linux/mmc/sh_mobile_sdhi.h b/include/linux/mmc/sh_mobile_sdhi.h index faf32b6ec185..bd50b365167f 100644 --- a/include/linux/mmc/sh_mobile_sdhi.h +++ b/include/linux/mmc/sh_mobile_sdhi.h @@ -1,5 +1,5 @@ -#ifndef __SH_MOBILE_SDHI_H__ -#define __SH_MOBILE_SDHI_H__ +#ifndef LINUX_MMC_SH_MOBILE_SDHI_H +#define LINUX_MMC_SH_MOBILE_SDHI_H #include @@ -17,4 +17,4 @@ struct sh_mobile_sdhi_info { int (*get_cd)(struct platform_device *pdev); }; -#endif /* __SH_MOBILE_SDHI_H__ */ +#endif /* LINUX_MMC_SH_MOBILE_SDHI_H */ diff --git a/include/linux/mmc/tmio.h b/include/linux/mmc/tmio.h index 19490b942db0..a1c1f321e519 100644 --- a/include/linux/mmc/tmio.h +++ b/include/linux/mmc/tmio.h @@ -12,8 +12,8 @@ * * TC6393XB TC6391XB TC6387XB T7L66XB ASIC3 */ -#ifndef _LINUX_MMC_TMIO_H_ -#define _LINUX_MMC_TMIO_H_ +#ifndef LINUX_MMC_TMIO_H +#define LINUX_MMC_TMIO_H #define CTL_SD_CMD 0x00 #define CTL_ARG_REG 0x04 @@ -21,6 +21,7 @@ #define CTL_XFER_BLK_COUNT 0xa #define CTL_RESPONSE 0x0c #define CTL_STATUS 0x1c +#define CTL_STATUS2 0x1e #define CTL_IRQ_MASK 0x20 #define CTL_SD_CARD_CLK_CTL 0x24 #define CTL_SD_XFER_LEN 0x26 @@ -30,6 +31,7 @@ #define CTL_TRANSACTION_CTL 0x34 #define CTL_SDIO_STATUS 0x36 #define CTL_SDIO_IRQ_MASK 0x38 +#define CTL_DMA_ENABLE 0xd8 #define CTL_RESET_SD 0xe0 #define CTL_SDIO_REGS 0x100 #define CTL_CLK_AND_WAIT_CTL 0x138 @@ -60,4 +62,4 @@ #define TMIO_BBS 512 /* Boot block size */ -#endif /* _LINUX_MMC_TMIO_H_ */ +#endif /* LINUX_MMC_TMIO_H */ diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index 15da0e99f48a..db4836bed514 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -155,12 +155,14 @@ struct ubi_device_info { }; /* - * enum - volume notification types. - * @UBI_VOLUME_ADDED: volume has been added - * @UBI_VOLUME_REMOVED: start volume volume - * @UBI_VOLUME_RESIZED: volume size has been re-sized - * @UBI_VOLUME_RENAMED: volume name has been re-named - * @UBI_VOLUME_UPDATED: volume name has been updated + * Volume notification types. 
+ * @UBI_VOLUME_ADDED: a volume has been added (an UBI device was attached or a + * volume was created) + * @UBI_VOLUME_REMOVED: a volume has been removed (an UBI device was detached + * or a volume was removed) + * @UBI_VOLUME_RESIZED: a volume has been re-sized + * @UBI_VOLUME_RENAMED: a volume has been re-named + * @UBI_VOLUME_UPDATED: data has been written to a volume * * These constants define which type of event has happened when a volume * notification function is invoked. diff --git a/include/linux/platform_data/pxa_sdhci.h b/include/linux/platform_data/pxa_sdhci.h new file mode 100644 index 000000000000..51ad0995abac --- /dev/null +++ b/include/linux/platform_data/pxa_sdhci.h @@ -0,0 +1,60 @@ +/* + * include/linux/platform_data/pxa_sdhci.h + * + * Copyright 2010 Marvell + * Zhangfei Gao + * + * PXA Platform - SDHCI platform data definitions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _PXA_SDHCI_H_ +#define _PXA_SDHCI_H_ + +/* pxa specific flag */ +/* Require clock free running */ +#define PXA_FLAG_ENABLE_CLOCK_GATING (1<<0) +/* card always wired to host, like on-chip emmc */ +#define PXA_FLAG_CARD_PERMANENT (1<<1) +/* Board design supports 8-bit data on SD/SDIO BUS */ +#define PXA_FLAG_SD_8_BIT_CAPABLE_SLOT (1<<2) + +/* + * struct sdhci_pxa_platdata - Platform device data for PXA SDHCI + * @flags: flags for platform requirement + * @clk_delay_cycles: + * mmp2: each step is roughly 100ps, 5bits width + * pxa910: each step is 1ns, 4bits width + * @clk_delay_sel: select clk_delay, used on pxa910 + * 0: choose feedback clk + * 1: choose feedback clk + delay value + * 2: choose internal clk + * @clk_delay_enable: enable clk_delay or not, used on pxa910 + * @ext_cd_gpio: gpio pin used for external CD line + * @ext_cd_gpio_invert: invert values for external CD gpio line + * @max_speed: the maximum speed supported + * @host_caps: Standard MMC host capabilities bit field. + * @quirks: quirks of platform + * @pm_caps: pm_caps of platform + */ +struct sdhci_pxa_platdata { + unsigned int flags; + unsigned int clk_delay_cycles; + unsigned int clk_delay_sel; + bool clk_delay_enable; + unsigned int ext_cd_gpio; + bool ext_cd_gpio_invert; + unsigned int max_speed; + unsigned int host_caps; + unsigned int quirks; + unsigned int pm_caps; +}; + +struct sdhci_pxa { + u8 clk_enable; + u8 power_mode; +}; +#endif /* _PXA_SDHCI_H_ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 496770a96487..14a6c7b545de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -844,6 +844,7 @@ enum cpu_idle_type { #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ +#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ @@ -893,16 +894,21 @@ static inline int sd_power_saving_flags(void) return 0; } -struct sched_group { - struct sched_group *next; /* Must be a circular list */ +struct sched_group_power { atomic_t ref; - /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a * single CPU.
*/ - unsigned int cpu_power, cpu_power_orig; + unsigned int power, power_orig; +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + unsigned int group_weight; + struct sched_group_power *sgp; /* * The CPUs this group covers. @@ -1254,6 +1260,9 @@ struct task_struct { #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; +#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) + int rcu_boosted; +#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */ struct list_head rcu_node_entry; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TREE_PREEMPT_RCU diff --git a/include/linux/slab.h b/include/linux/slab.h index ad4dd1c8d30a..573c809c33d9 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -133,6 +133,26 @@ unsigned int kmem_cache_size(struct kmem_cache *); #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_HIGH) #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_HIGH - PAGE_SHIFT) +/* + * Some archs want to perform DMA into kmalloc caches and need a guaranteed + * alignment larger than the alignment of a 64-bit integer. + * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that. + */ +#ifdef ARCH_DMA_MINALIGN +#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN +#else +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + +/* + * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. + * Intended for arches that get misalignment faults even for 64 bit integer + * aligned buffers. + */ +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) +#endif + /* * Common kmalloc functions provided by all allocators */ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 83203ae9390b..d00e0bacda93 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -17,32 +17,6 @@ #include -/* - * Enforce a minimum alignment for the kmalloc caches. - * Usually, the kmalloc caches are cache_line_size() aligned, except when - * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. - * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than the alignment of a 64-bit integer. - * ARCH_KMALLOC_MINALIGN allows that. - * Note that increasing this value may disable some debug features. - */ -#ifdef ARCH_DMA_MINALIGN -#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN -#else -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -/* - * Enforce a minimum alignment for all caches. - * Intended for archs that get misalignment faults even for BYTES_PER_WORD - * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. - * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables - * some debug features. - */ -#define ARCH_SLAB_MINALIGN 0 -#endif - /* * struct kmem_cache * @@ -50,21 +24,19 @@ */ struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ +/* 1) Cache tunables. 
Protected by cache_chain_mutex */ unsigned int batchcount; unsigned int limit; unsigned int shared; unsigned int buffer_size; u32 reciprocal_buffer_size; -/* 3) touched by every alloc & free from the backend */ +/* 2) touched by every alloc & free from the backend */ unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ -/* 4) cache_grow/shrink */ +/* 3) cache_grow/shrink */ /* order of pgs per slab (2^n) */ unsigned int gfporder; @@ -80,11 +52,11 @@ struct kmem_cache { /* constructor func */ void (*ctor)(void *obj); -/* 5) cache creation/removal */ +/* 4) cache creation/removal */ const char *name; struct list_head next; -/* 6) statistics */ +/* 5) statistics */ #ifdef CONFIG_DEBUG_SLAB unsigned long num_active; unsigned long num_allocations; @@ -111,16 +83,18 @@ struct kmem_cache { int obj_size; #endif /* CONFIG_DEBUG_SLAB */ +/* 6) per-cpu/per-node data, touched during every alloc/free */ /* - * We put nodelists[] at the end of kmem_cache, because we want to size - * this array to nr_node_ids slots instead of MAX_NUMNODES + * We put array[] at the end of kmem_cache, because we want to size + * this array to nr_cpu_ids slots instead of NR_CPUS * (see kmem_cache_init()) - * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of nodes. + * We still use [NR_CPUS] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of cpus. */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; + struct kmem_list3 **nodelists; + struct array_cache *array[NR_CPUS]; /* - * Do not add fields after nodelists[] + * Do not add fields after array[] */ }; diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 4382db09df4f..0ec00b39d006 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -1,16 +1,6 @@ #ifndef __LINUX_SLOB_DEF_H #define __LINUX_SLOB_DEF_H -#ifdef ARCH_DMA_MINALIGN -#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN -#else -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) -#endif - void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index c8668d161dd8..4b35c06dfbc5 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -113,16 +113,6 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) -#ifdef ARCH_DMA_MINALIGN -#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN -#else -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) -#endif - /* * Maximum kmalloc object size handled by SLUB. Larger object allocations * are passed through to the page allocator. The page allocator "fastpath" @@ -228,6 +218,19 @@ kmalloc_order(size_t size, gfp_t flags, unsigned int order) return ret; } +/** + * Calling this on allocated memory will check that the memory + * is expected to be in use, and print warnings if not. 
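+ *
+ * Illustrative use (a sketch, not part of this header):
+ *
+ *	p = kmalloc(32, GFP_KERNEL);
+ *	WARN_ON(!verify_mem_not_deleted(p));	(object still live: true)
+ *	kfree(p);
+ *	(called now, it may report "Object is on free-list" and return false)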
+ */ +#ifdef CONFIG_SLUB_DEBUG +extern bool verify_mem_not_deleted(const void *x); +#else +static inline bool verify_mem_not_deleted(const void *x) +{ + return true; +} +#endif + #ifdef CONFIG_TRACING extern void * kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size); diff --git a/include/xen/balloon.h b/include/xen/balloon.h index a2b22f01a51d..4076ed72afbd 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -23,3 +23,13 @@ void balloon_set_new_target(unsigned long target); int alloc_xenballooned_pages(int nr_pages, struct page** pages); void free_xenballooned_pages(int nr_pages, struct page** pages); + +struct sys_device; +#ifdef CONFIG_XEN_SELFBALLOONING +extern int register_xen_selfballooning(struct sys_device *sysdev); +#else +static inline int register_xen_selfballooning(struct sys_device *sysdev) +{ + return -ENOSYS; +} +#endif diff --git a/include/xen/events.h b/include/xen/events.h index 9af21e19545a..d287997d3eab 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -74,8 +74,6 @@ int xen_set_callback_via(uint64_t via); void xen_evtchn_do_upcall(struct pt_regs *regs); void xen_hvm_evtchn_do_upcall(void); -/* Allocate a pirq for a physical interrupt, given a gsi. */ -int xen_allocate_pirq_gsi(unsigned gsi); /* Bind a pirq for a physical interrupt to an irq. */ int xen_bind_pirq_gsi_to_irq(unsigned gsi, unsigned pirq, int shareable, char *name); diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h index c3adde32669b..901724dc528d 100644 --- a/include/xen/hvc-console.h +++ b/include/xen/hvc-console.h @@ -6,11 +6,13 @@ extern struct console xenboot_console; #ifdef CONFIG_HVC_XEN void xen_console_resume(void); void xen_raw_console_write(const char *str); +__attribute__((format(printf, 1, 2))) void xen_raw_printk(const char *fmt, ...); #else static inline void xen_console_resume(void) { } static inline void xen_raw_console_write(const char *str) { } -static inline void xen_raw_printk(const char *fmt, ...) { } +static inline __attribute__((format(printf, 1, 2))) +void xen_raw_printk(const char *fmt, ...) { } #endif #endif /* XEN_HVC_CONSOLE_H */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 70213b4515eb..6acd9cefd517 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -450,6 +450,45 @@ struct start_info { int8_t cmd_line[MAX_GUEST_CMDLINE]; }; +struct dom0_vga_console_info { + uint8_t video_type; +#define XEN_VGATYPE_TEXT_MODE_3 0x03 +#define XEN_VGATYPE_VESA_LFB 0x23 + + union { + struct { + /* Font height, in pixels. */ + uint16_t font_height; + /* Cursor location (column, row). */ + uint16_t cursor_x, cursor_y; + /* Number of rows and columns (dimensions in characters). */ + uint16_t rows, columns; + } text_mode_3; + + struct { + /* Width and height, in pixels. */ + uint16_t width, height; + /* Bytes per scan line. */ + uint16_t bytes_per_line; + /* Bits per pixel. */ + uint16_t bits_per_pixel; + /* LFB physical address, and size (in units of 64kB). */ + uint32_t lfb_base; + uint32_t lfb_size; + /* RGB mask offsets and sizes, as defined by VBE 1.2+ */ + uint8_t red_pos, red_size; + uint8_t green_pos, green_size; + uint8_t blue_pos, blue_size; + uint8_t rsvd_pos, rsvd_size; + + /* VESA capabilities (offset 0xa, VESA command 0x4f00). */ + uint32_t gbl_caps; + /* Mode attributes (offset 0x0, VESA command 0x4f01). */ + uint16_t mode_attrs; + } vesa_lfb; + } u; +}; + /* These flags are passed in the 'flags' field of start_info_t. 
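 * A guest typically tests these against its start_info page, e.g.
 * (illustrative; setup_dom0() is a hypothetical helper):
 *
 *	if (xen_start_info->flags & SIF_INITDOMAIN)
 *		setup_dom0();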
*/ #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ diff --git a/include/xen/tmem.h b/include/xen/tmem.h new file mode 100644 index 000000000000..82e2c83a32f5 --- /dev/null +++ b/include/xen/tmem.h @@ -0,0 +1,5 @@ +#ifndef _XEN_TMEM_H +#define _XEN_TMEM_H +/* defined in drivers/xen/tmem.c */ +extern int tmem_enabled; +#endif /* _XEN_TMEM_H */ diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 5467369e0889..aceeca799fd7 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -223,7 +223,9 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port); enum xenbus_state xenbus_read_driver_state(const char *path); +__attribute__((format(printf, 3, 4))) void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...); +__attribute__((format(printf, 3, 4))) void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...); const char *xenbus_strstate(enum xenbus_state state); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 75113cb7c4fb..8aafbb80b8b0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; +static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_data *rdp; struct rcu_node *rnp; - if (t->rcu_read_lock_nesting && + if (t->rcu_read_lock_nesting > 0 && (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ @@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu) rnp->gp_tasks = &t->rcu_node_entry; } raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else if (t->rcu_read_lock_nesting < 0 && + t->rcu_read_unlock_special) { + + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); } /* @@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static void rcu_read_unlock_special(struct task_struct *t) +static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block. */ - if (in_irq()) { + if (in_irq() || in_serving_softirq()) { local_irq_restore(flags); return; } @@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; + /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ + if (t->rcu_boosted) { + special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 0; + } #endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; @@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. 
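 * Releasing ->rcu_boost_mutex below lets the booster's rt_mutex_lock()
 * in rcu_boost() return, dropping the priority this reader inherited
 * while it was blocking the grace period.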
*/ if (special & RCU_READ_UNLOCK_BOOSTED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; rt_mutex_unlock(t->rcu_boost_mutex); t->rcu_boost_mutex = NULL; } @@ -387,13 +400,22 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - --t->rcu_read_lock_nesting; - barrier(); /* decrement before load of ->rcu_read_unlock_special */ - if (t->rcu_read_lock_nesting == 0 && - unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); + if (t->rcu_read_lock_nesting != 1) + --t->rcu_read_lock_nesting; + else { + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } #endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu) rcu_preempt_qs(cpu); return; } - if (per_cpu(rcu_preempt_data, cpu).qs_pending) + if (t->rcu_read_lock_nesting > 0 && + per_cpu(rcu_preempt_data, cpu).qs_pending) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } @@ -695,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; + } if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); wake_up(&sync_rcu_preempt_exp_wq); break; } @@ -707,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock(&rnp->lock); /* irqs already disabled */ rnp->expmask &= ~mask; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1174,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 1; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. 
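 * The boosted reader has already released this mutex in
 * rcu_read_unlock_special(), so the rt_mutex_lock() above has
 * returned; unlocking here merely balances acquire/release for
 * lockdep.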
*/
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc716f6d8ad..fde6ff903525 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2544,13 +2544,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+static void sched_ttwu_do_pending(struct task_struct *list)
 {
	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
-		return;
 
	raw_spin_lock(&rq->lock);
 
@@ -2563,9 +2559,45 @@ static void sched_ttwu_pending(void)
	raw_spin_unlock(&rq->lock);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
 void scheduler_ipi(void)
 {
-	sched_ttwu_pending();
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	/*
+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+	 * traditionally all their work was done from the interrupt return
+	 * path. Now that we actually do some work, we need to make sure
+	 * we do call them.
+	 *
+	 * Some archs already do call them, luckily irq_enter/exit nest
+	 * properly.
+	 *
+	 * Arguably we should visit all archs and update all handlers,
+	 * however a fair share of IPIs are still resched only so this would
+	 * somewhat pessimize the simple resched case.
+	 */
+	irq_enter();
+	sched_ttwu_do_pending(list);
+	irq_exit();
 }
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
@@ -6557,7 +6589,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
			break;
		}
 
-		if (!group->cpu_power) {
+		if (!group->sgp->power) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: domain->cpu_power not "
					"set\n");
@@ -6581,9 +6613,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
		printk(KERN_CONT " %s", str);
 
-		if (group->cpu_power != SCHED_POWER_SCALE) {
+		if (group->sgp->power != SCHED_POWER_SCALE) {
			printk(KERN_CONT " (cpu_power = %d)",
-				group->cpu_power);
+				group->sgp->power);
		}
 
		group = group->next;
@@ -6774,11 +6806,39 @@ static struct root_domain *alloc_rootdomain(void)
	return rd;
 }
 
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
 static void free_sched_domain(struct rcu_head *rcu)
 {
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref))
+
+	/*
+	 * If it's an overlapping domain it has private groups, iterate and
+	 * nuke them all.
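+	 * (free_sched_groups() above walks the circular ->next ring; its
+	 * second argument chooses whether each group's sched_group_power
+	 * reference is dropped as well.)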
+ */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); kfree(sd->groups); + } kfree(sd); } @@ -6945,6 +7005,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; }; struct s_data { @@ -6964,15 +7025,73 @@ struct sched_domain_topology_level; typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +#define SDTL_OVERLAP 0x01 + struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + int flags; struct sd_data data; }; -/* - * Assumes the sched_domain tree is fully constructed - */ +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *child; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(i)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + + child = *per_cpu_ptr(sdd->sd, i); + if (child->child) { + child = child->child; + cpumask_copy(sg_span, sched_domain_span(child)); + } else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + atomic_inc(&sg->sgp->ref); + + if (cpumask_test_cpu(cpu, sg_span)) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -6981,24 +7100,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) + if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + } return cpu; } /* - * build_sched_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. 
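+ *
+ * For example, a domain spanning CPUs 0-3 whose child domains are the
+ * pairs {0,1} and {2,3} ends up with two groups linked in a ring:
+ *
+ *	{0,1} -> {2,3} -> {0,1} -> ...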
+ * + * Assumes the sched_domain tree is fully constructed */ -static void -build_sched_groups(struct sched_domain *sd) +static int +build_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; @@ -7006,6 +7125,12 @@ build_sched_groups(struct sched_domain *sd) struct cpumask *covered; int i; + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(sched_domain_span(sd))) + return 0; + lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; @@ -7020,7 +7145,7 @@ build_sched_groups(struct sched_domain *sd) continue; cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; + sg->sgp->power = 0; for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -7037,6 +7162,8 @@ build_sched_groups(struct sched_domain *sd) last = sg; } last->next = first; + + return 0; } /* @@ -7051,12 +7178,17 @@ build_sched_groups(struct sched_domain *sd) */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - WARN_ON(!sd || !sd->groups); + struct sched_group *sg = sd->groups; - if (cpu != group_first_cpu(sd->groups)) - return; + WARN_ON(!sd || !sg); - sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); + do { + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg = sg->next; + } while (sg != sd->groups); + + if (cpu != group_first_cpu(sg)) + return; update_group_power(sd, cpu); } @@ -7177,15 +7309,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, static void claim_allocations(int cpu, struct sched_domain *sd) { struct sd_data *sdd = sd->private; - struct sched_group *sg = sd->groups; WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (cpu == cpumask_first(sched_group_cpus(sg))) { - WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - } + + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) + *per_cpu_ptr(sdd->sgp, cpu) = NULL; } #ifdef CONFIG_SCHED_SMT @@ -7210,7 +7342,7 @@ static struct sched_domain_topology_level default_topology[] = { #endif { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, }, + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif { NULL, }, @@ -7234,9 +7366,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; + sdd->sgp = alloc_percpu(struct sched_group_power *); + if (!sdd->sgp) + return -ENOMEM; + for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; + struct sched_group_power *sgp; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -7251,6 +7388,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) return -ENOMEM; *per_cpu_ptr(sdd->sg, j) = sg; + + sgp = kzalloc_node(sizeof(struct sched_group_power), + GFP_KERNEL, cpu_to_node(j)); + if (!sgp) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgp, j) = sgp; } } @@ -7266,11 +7410,15 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - kfree(*per_cpu_ptr(sdd->sd, j)); + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sg, j)); + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); free_percpu(sdd->sg); + free_percpu(sdd->sgp); } } @@ -7316,8 +7464,13 @@ static 
int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) + for (tl = sched_domain_topology; tl->init; tl++) { sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; + } while (sd->child) sd = sd->child; @@ -7329,13 +7482,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - get_group(i, sd->private, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (i != cpumask_first(sched_domain_span(sd))) - continue; - - build_sched_groups(sd); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } } } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 433491c2dc8f..c768588e180b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; if (local_group) { this_load = avg_load; @@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->cpu_power_orig = power; + sdg->sgp->power_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->cpu_power = power; + sdg->sgp->power = power; } static void update_group_power(struct sched_domain *sd, int cpu) @@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power += group->cpu_power; + power += group->sgp->power; group = group->next; } while (group != child->groups); - sdg->cpu_power = power; + sdg->sgp->power = power; } /* @@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * If ~90% of the cpu_power is still there, we're good. 
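 * (29/32 is roughly 0.906, so the integer comparison below,
 * power * 32 > power_orig * 29, is a division-free test for power
 * exceeding ~90% of power_orig.)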
*/ - if (group->cpu_power * 32 > group->cpu_power_orig * 29) + if (group->sgp->power * 32 > group->sgp->power_orig * 29) return 1; return 0; @@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; /* * Consider the group unbalanced when the imbalance is larger @@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); @@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, return; sds->total_load += sgs.group_load; - sds->total_pwr += sg->cpu_power; + sds->total_pwr += sg->sgp->power; /* * In case the child domain prefers tasks go to siblings @@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd, if (this_cpu > busiest_cpu) return 0; - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); return 1; } @@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->cpu_power; + scaled_busy_load_per_task /= sds->busiest->sgp->power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. 
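 * (pwr_now and pwr_move below estimate the useful CPU power, in
 * SCHED_POWER_SCALE units, before and after hypothetically moving one
 * task; the move is only treated as a win when pwr_move > pwr_now.)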
*/ - pwr_now += sds->busiest->cpu_power * + pwr_now += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->cpu_power * + pwr_now += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->cpu_power; + sds->busiest->sgp->power; if (sds->max_load > tmp) - pwr_move += sds->busiest->cpu_power * + pwr_move += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->cpu_power < + if (sds->max_load * sds->busiest->sgp->power < sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->cpu_power) / - sds->this->cpu_power; + tmp = (sds->max_load * sds->busiest->sgp->power) / + sds->this->sgp->power; else tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->cpu_power; - pwr_move += sds->this->cpu_power * + sds->this->sgp->power; + pwr_move += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_POWER_SCALE; @@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->cpu_power; + load_above_capacity /= sds->busiest->sgp->power; } /* @@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->cpu_power, - (sds->avg_load - sds->this_load) * sds->this->cpu_power) + *imbalance = min(max_pull * sds->busiest->sgp->power, + (sds->avg_load - sds->this_load) * sds->this->sgp->power) / SCHED_POWER_SCALE; /* diff --git a/kernel/sched_features.h b/kernel/sched_features.h index be40f7371ee1..1e7066d76c26 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1) * using the scheduler IPI. Reduces rq->lock contention/bounces. 
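 * (See ttwu_queue_remote() in kernel/sched.c: the task is chained onto
 * the target CPU's rq->wake_list and that CPU is kicked with an IPI,
 * instead of this CPU taking the remote rq->lock directly.)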
*/ SCHED_FEAT(TTWU_QUEUE, 1) + +SCHED_FEAT(FORCE_SD_OVERLAP, 0) diff --git a/kernel/signal.c b/kernel/signal.c index ff7678603328..415d85d6f6c6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1178,18 +1178,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, { struct sighand_struct *sighand; - rcu_read_lock(); for (;;) { + local_irq_save(*flags); + rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) + if (unlikely(sighand == NULL)) { + rcu_read_unlock(); + local_irq_restore(*flags); break; + } - spin_lock_irqsave(&sighand->siglock, *flags); - if (likely(sighand == tsk->sighand)) + spin_lock(&sighand->siglock); + if (likely(sighand == tsk->sighand)) { + rcu_read_unlock(); break; - spin_unlock_irqrestore(&sighand->siglock, *flags); + } + spin_unlock(&sighand->siglock); + rcu_read_unlock(); + local_irq_restore(*flags); } - rcu_read_unlock(); return sighand; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 40cf63ddd4b3..fca82c32042b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -315,16 +315,24 @@ static inline void invoke_softirq(void) { if (!force_irqthreads) __do_softirq(); - else + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } } #else static inline void invoke_softirq(void) { if (!force_irqthreads) do_softirq(); - else + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } } #endif diff --git a/mm/slab.c b/mm/slab.c index d96e223de775..1e523ed47c61 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ +static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache cache_cache = { + .nodelists = cache_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void) cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* - * struct kmem_cache size depends on nr_node_ids, which - * can be less than MAX_NUMNODES. 
+ * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + - nr_node_ids * sizeof(struct kmem_list3 *); + cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *); #if DEBUG cache_cache.obj_size = cache_cache.buffer_size; #endif @@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep) goto oops; + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; #if DEBUG cachep->obj_size = size; @@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); -#if ARCH_SLAB_MINALIGN - if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { + if (ARCH_SLAB_MINALIGN && + ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", - objp, ARCH_SLAB_MINALIGN); + objp, (int)ARCH_SLAB_MINALIGN); } -#endif return objp; } #else diff --git a/mm/slob.c b/mm/slob.c index 46e0aee33a23..0ae881831ae2 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; + gfp &= gfp_allowed_mask; + lockdep_trace_alloc(gfp); if (size < PAGE_SIZE - align) { @@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, diff --git a/mm/slub.c b/mm/slub.c index 35f351f26193..ba83f3fd0757 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches); /* * Tracking user of a slab. 
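 * With CONFIG_STACKTRACE enabled, each track record below also keeps
 * up to TRACK_ADDRS_COUNT caller frames, filled in by
 * save_stack_trace() in set_track() and dumped by print_track() when
 * an object is reported.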
*/ +#define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object, struct track *p = get_track(s, object, alloc); if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + save_stack_trace(&trace); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; @@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t) printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif } static void print_tracking(struct kmem_cache *s, void *object) @@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) { while (bytes) { - if (*start != (u8)value) + if (*start != value) return start; start++; bytes--; @@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) return NULL; } +static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) +{ + u64 value64; + unsigned int words, prefix; + + if (bytes <= 16) + return check_bytes8(start, value, bytes); + + value64 = value | value << 8 | value << 16 | value << 24; + value64 = value64 | value64 << 32; + prefix = 8 - ((unsigned long)start) % 8; + + if (prefix) { + u8 *r = check_bytes8(start, value, prefix); + if (r) + return r; + start += prefix; + bytes -= prefix; + } + + words = bytes / 8; + + while (words) { + if (*(u64 *)start != value64) + return check_bytes8(start, value, 8); + start += 8; + words--; + } + + return check_bytes8(start, value, bytes % 8); +} + static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { @@ -2928,6 +2993,42 @@ size_t ksize(const void *object) } EXPORT_SYMBOL(ksize); +#ifdef CONFIG_SLUB_DEBUG +bool verify_mem_not_deleted(const void *x) +{ + struct page *page; + void *object = (void *)x; + unsigned long flags; + bool rv; + + if (unlikely(ZERO_OR_NULL_PTR(x))) + return false; + + local_irq_save(flags); + + page = virt_to_head_page(x); + if (unlikely(!PageSlab(page))) { + /* maybe it was from stack? */ + rv = true; + goto out_unlock; + } + + slab_lock(page); + if (on_freelist(page->slab, page, object)) { + object_err(page->slab, page, object, "Object is on free-list"); + rv = false; + } else { + rv = true; + } + slab_unlock(page); + +out_unlock: + local_irq_restore(flags); + return rv; +} +EXPORT_SYMBOL(verify_mem_not_deleted); +#endif + void kfree(const void *x) { struct page *page;