From bfcfaa77bdf0f775263e906015982a608df01c76 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Tue, 6 Mar 2012 11:16:17 -0800
Subject: [PATCH] vfs: use 'unsigned long' accesses for dcache name comparison
 and hashing

Ok, this is hacky, and only works on little-endian machines with good
unaligned handling.  And even then only with CONFIG_DEBUG_PAGEALLOC
disabled, since it can access up to 7 bytes after the pathname.

But it runs like a bat out of hell.

Signed-off-by: Linus Torvalds
---
 arch/x86/Kconfig |   1 +
 fs/Kconfig       |   4 ++
 fs/dcache.c      |  23 +++++++++
 fs/namei.c       | 122 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 150 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..09675d3e0ac3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
+	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/fs/Kconfig b/fs/Kconfig
index d621f02a3f9e..aa195265362f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,10 @@
 
 menu "File systems"
 
+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+	bool
+
 if BLOCK
 
 source "fs/ext2/Kconfig"
diff --git a/fs/dcache.c b/fs/dcache.c
index bcbdb33fcc20..ffd47a16d870 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -144,6 +144,28 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 				const unsigned char *ct, size_t tcount)
 {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+	unsigned long a,b,mask;
+
+	if (unlikely(scount != tcount))
+		return 1;
+
+	for (;;) {
+		a = *(unsigned long *)cs;
+		b = *(unsigned long *)ct;
+		if (tcount < sizeof(unsigned long))
+			break;
+		if (unlikely(a != b))
+			return 1;
+		cs += sizeof(unsigned long);
+		ct += sizeof(unsigned long);
+		tcount -= sizeof(unsigned long);
+		if (!tcount)
+			return 0;
+	}
+	mask = ~(~0ul << tcount*8);
+	return unlikely(!!((a ^ b) & mask));
+#else
 	if (scount != tcount)
 		return 1;
 
@@ -155,6 +177,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 		tcount--;
 	} while (tcount);
 	return 0;
+#endif
 }
 
 static void __d_free(struct rcu_head *head)
diff --git a/fs/namei.c b/fs/namei.c
index e2ba62820a0f..378497a744b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1374,6 +1374,126 @@ static inline int can_lookup(struct inode *inode)
 	return 1;
 }
 
+/*
+ * We can do the critical dentry name comparison and hashing
+ * operations one word at a time, but we are limited to:
+ *
+ * - Architectures with fast unaligned word accesses. We could
+ *   do a "get_unaligned()" if this helps and is sufficiently
+ *   fast.
+ *
+ * - Little-endian machines (so that we can generate the mask
+ *   of low bytes efficiently). Again, we *could* do a byte
+ *   swapping load on big-endian architectures if that is not
+ *   expensive enough to make the optimization worthless.
+ *
+ * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+ *   do not trap on the (extremely unlikely) case of a page
+ *   crossing operation).
+ *
+ * - Furthermore, we need an efficient 64-bit compile for the
+ *   64-bit case in order to generate the "number of bytes in
+ *   the final mask". Again, that could be replaced with an
+ *   efficient population count instruction or similar.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#ifdef CONFIG_64BIT
+
+/*
+ * Jan Achrenius on G+: microoptimized version of
+ * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
+ * that works for the bytemasks without having to
+ * mask them first.
+ */
+static inline long count_masked_bytes(unsigned long mask)
+{
+	return mask*0x0001020304050608 >> 56;
+}
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+	hash += hash >> (8*sizeof(int));
+	return hash;
+}
+
+#else	/* 32-bit case */
+
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
+static inline long count_masked_bytes(long mask)
+{
+	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+	long a = (0x0ff0001+mask) >> 23;
+	/* Fix the 1 for 00 case */
+	return a & mask;
+}
+
+#define fold_hash(x) (x)
+
+#endif
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+	unsigned long a, mask;
+	unsigned long hash = 0;
+
+	for (;;) {
+		a = *(unsigned long *)name;
+		hash *= 9;
+		if (len < sizeof(unsigned long))
+			break;
+		hash += a;
+		name += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+		if (!len)
+			goto done;
+	}
+	mask = ~(~0ul << len*8);
+	hash += mask & a;
+done:
+	return fold_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+#define ONEBYTES	0x0101010101010101ul
+#define SLASHBYTES	0x2f2f2f2f2f2f2f2ful
+#define HIGHBITS	0x8080808080808080ul
+
+/* Return the high bit set in the first byte that is a zero */
+static inline unsigned long has_zero(unsigned long a)
+{
+	return ((a - ONEBYTES) & ~a) & HIGHBITS;
+}
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the length of the component;
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+	unsigned long a, mask, hash, len;
+
+	hash = a = 0;
+	len = -sizeof(unsigned long);
+	do {
+		hash = (hash + a) * 9;
+		len += sizeof(unsigned long);
+		a = *(unsigned long *)(name+len);
+		/* Do we have any NUL or '/' bytes in this word? */
+		mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
+	} while (!mask);
+
+	/* The mask *below* the first high bit set */
+	mask = (mask - 1) & ~mask;
+	mask >>= 7;
+	hash += a & mask;
+	*hashp = fold_hash(hash);
+
+	return len + count_masked_bytes(mask);
+}
+
+#else
+
 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 {
 	unsigned long hash = init_name_hash();
@@ -1402,6 +1522,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 	return len;
 }
 
+#endif
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
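For reference, and not part of the patch itself: a minimal user-space sketch of
the tail-masking compare that dentry_cmp() performs above.  It assumes a
little-endian machine, uses memcpy() for the word loads where the kernel relies
on fast direct unaligned accesses, and keeps the names in padded buffers so the
final whole-word read stays in bounds (standing in for the
CONFIG_DEBUG_PAGEALLOC caveat in the commit message).

	/* word_cmp_demo.c: tail-masking compare, cf. dentry_cmp() above */
	#include <stdio.h>
	#include <string.h>

	static int word_cmp(const unsigned char *cs, const unsigned char *ct, size_t tcount)
	{
		unsigned long a, b, mask;

		for (;;) {
			memcpy(&a, cs, sizeof(a));	/* the patch does direct unaligned word loads */
			memcpy(&b, ct, sizeof(b));
			if (tcount < sizeof(unsigned long))
				break;
			if (a != b)
				return 1;
			cs += sizeof(unsigned long);
			ct += sizeof(unsigned long);
			tcount -= sizeof(unsigned long);
			if (!tcount)
				return 0;
		}
		/* Little-endian: keep only the low 'tcount' bytes of the final, partial word */
		mask = ~(~0ul << tcount*8);
		return !!((a ^ b) & mask);
	}

	int main(void)
	{
		/* Padded buffers so whole-word reads never run off the end */
		unsigned char a[16] = "dentry", b[16] = "dentrx";

		printf("%d %d\n", word_cmp(a, b, 6), word_cmp(a, a, 6));	/* prints "1 0" */
		return 0;
	}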
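Likewise, a sketch of the hash_name() terminator search: has_zero() puts a high
bit in every byte that is NUL (or, after XOR with SLASHBYTES, '/'), the mask
arithmetic keeps only the bytes below the first such marker, and
count_masked_bytes() turns that bytemask into a byte count.  This assumes a
64-bit little-endian machine; the buffer is padded for the same reason as
above, and the hashing itself is omitted so the result can be checked directly
against strcspn().

	/* component_len_demo.c: word-at-a-time '/'/NUL search, cf. hash_name() above */
	#include <stdio.h>
	#include <string.h>

	#define ONEBYTES	0x0101010101010101ul
	#define SLASHBYTES	0x2f2f2f2f2f2f2f2ful
	#define HIGHBITS	0x8080808080808080ul

	/* High bit set in the first byte of 'a' that is zero (later bits may be noise) */
	static unsigned long has_zero(unsigned long a)
	{
		return ((a - ONEBYTES) & ~a) & HIGHBITS;
	}

	/* Count the 0xff bytes in a low-bytes mask, e.g. 0x0000000000ffffff -> 3 */
	static long count_masked_bytes(unsigned long mask)
	{
		return mask*0x0001020304050608ul >> 56;
	}

	/* Length of the component up to the first '/' or NUL, one word at a time */
	static size_t component_len(const char *name)
	{
		unsigned long a, mask;
		size_t len = 0;

		for (;;) {
			memcpy(&a, name + len, sizeof(a));	/* kernel does a direct unaligned load */
			mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
			if (mask)
				break;
			len += sizeof(unsigned long);
		}
		/* Keep only the bits below the first marker, then count whole bytes */
		mask = (mask - 1) & ~mask;
		mask >>= 7;
		return len + count_masked_bytes(mask);
	}

	int main(void)
	{
		char buf[32] = "usr/share/doc";	/* padded: word reads never leave buf */

		printf("%zu %zu\n", component_len(buf), strcspn(buf, "/"));	/* prints "3 3" */
		return 0;
	}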