From be7415810281b9ababe5fcec34236a0536af5939 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 15 Jun 2016 14:17:06 +0200 Subject: [PATCH] kernel: replace SMP cacheflush fix with some patches from linux-mti.git Signed-off-by: Felix Fietkau --- ...e-IPI-calls-for-CM-indexed-cache-ops.patch | 317 ++++++++++++++++++ ...4k-Exclude-sibling-CPUs-in-SMP-calls.patch | 37 ++ .../100-MIPS-fix-MT_SMP-cacheflush.patch | 17 - 3 files changed, 354 insertions(+), 17 deletions(-) create mode 100644 target/linux/generic/patches-4.4/090-MIPS-c-r4k-Use-IPI-calls-for-CM-indexed-cache-ops.patch create mode 100644 target/linux/generic/patches-4.4/091-MIPS-c-r4k-Exclude-sibling-CPUs-in-SMP-calls.patch delete mode 100644 target/linux/generic/patches-4.4/100-MIPS-fix-MT_SMP-cacheflush.patch diff --git a/target/linux/generic/patches-4.4/090-MIPS-c-r4k-Use-IPI-calls-for-CM-indexed-cache-ops.patch b/target/linux/generic/patches-4.4/090-MIPS-c-r4k-Use-IPI-calls-for-CM-indexed-cache-ops.patch new file mode 100644 index 0000000000..0c1c0a4509 --- /dev/null +++ b/target/linux/generic/patches-4.4/090-MIPS-c-r4k-Use-IPI-calls-for-CM-indexed-cache-ops.patch @@ -0,0 +1,317 @@ +From: James Hogan +Date: Mon, 25 Jan 2016 21:30:00 +0000 +Subject: [PATCH] MIPS: c-r4k: Use IPI calls for CM indexed cache ops + +The Coherence Manager (CM) can propagate address-based ("hit") cache +operations to other cores in the coherent system, alleviating software +of the need to use IPI calls, however indexed cache operations are not +propagated since doing so makes no sense for separate caches. + +r4k_on_each_cpu() previously had a special case for CONFIG_MIPS_MT_SMP, +intended to avoid the IPIs when the only other CPUs in the system were +other VPEs in the same core, and hence sharing the same caches. 
This was +changed by commit cccf34e9411c ("MIPS: c-r4k: Fix cache flushing for MT +cores") to apparently handle multi-core multi-VPE systems, but it +focussed mainly on hit cache ops, so the IPI calls were still disabled +entirely for CM systems. + +This doesn't normally cause problems, but tests can be written to hit +these corner cases by using multiple threads, or changing task +affinities to force the process to migrate cores. For example the +failure of mprotect RW->RX to globally sync icaches (via +flush_cache_range) can be detected by modifying and mprotecting a code +page on one core, and migrating to a different core to execute from it. + +Most of the functions called by r4k_on_each_cpu() perform cache +operations exclusively with a single addressing-type (virtual address vs +indexed), so add a type argument and modify the callers to pass in +R4K_USER (user virtual addressing), R4K_KERN (global kernel virtual +addressing) or R4K_INDEX (index into cache). + +local_r4k_flush_icache_range() is split up, to allow it to be called +from the rest of the kernel, or from r4k_flush_icache_range() where it +will choose either indexed or hit cache operations based on the size of +the range and the cache sizes. + +local_r4k_flush_kernel_vmap_range() is split into two functions, each of +which uses cache operations with a single addressing-type, with +r4k_flush_kernel_vmap_range() making the decision whether to use indexed +cache ops or not. + +Signed-off-by: James Hogan +Cc: Ralf Baechle +Cc: Paul Burton +Cc: Leonid Yegoshin +Cc: linux-mips@linux-mips.org +--- + +--- a/arch/mips/mm/c-r4k.c ++++ b/arch/mips/mm/c-r4k.c +@@ -40,6 +40,50 @@ + #include + + /* ++ * Bits describing what cache ops an IPI callback function may perform. ++ * ++ * R4K_USER - Virtual user address based cache operations. ++ * Ineffective on other CPUs. ++ * R4K_KERN - Virtual kernel address based cache operations (including kmap). ++ * Effective on other CPUs. 
++ * R4K_INDEX - Index based cache operations. ++ * Effective on other CPUs. ++ */ ++ ++#define R4K_USER BIT(0) ++#define R4K_KERN BIT(1) ++#define R4K_INDEX BIT(2) ++ ++#ifdef CONFIG_SMP ++/* The Coherence manager propagates address-based cache ops to other cores */ ++#define r4k_hit_globalized mips_cm_present() ++#define r4k_index_globalized 0 ++#else ++/* If there's only 1 CPU, then all cache ops are globalized to that 1 CPU */ ++#define r4k_hit_globalized 1 ++#define r4k_index_globalized 1 ++#endif ++ ++/** ++ * r4k_op_needs_ipi() - Decide if a cache op needs to be done on every core. ++ * @type: Type of cache operations (R4K_USER, R4K_KERN or R4K_INDEX). ++ * ++ * Returns: 1 if the cache operation @type should be done on every core in ++ * the system. ++ * 0 if the cache operation @type is globalized and only needs to ++ * be performed on a single CPU. ++ */ ++static inline bool r4k_op_needs_ipi(unsigned int type) ++{ ++ /* ++ * If hardware doesn't globalize the required cache ops we must use IPIs ++ * to do so. ++ */ ++ return (type & R4K_KERN && !r4k_hit_globalized) || ++ (type & R4K_INDEX && !r4k_index_globalized); ++} ++ ++/* + * Special Variant of smp_call_function for use by cache functions: + * + * o No return value +@@ -48,19 +92,11 @@ + * primary cache. + * o doesn't disable interrupts on the local CPU + */ +-static inline void r4k_on_each_cpu(void (*func) (void *info), void *info) ++static inline void r4k_on_each_cpu(unsigned int type, ++ void (*func) (void *info), void *info) + { + preempt_disable(); +- +- /* +- * The Coherent Manager propagates address-based cache ops to other +- * cores but not index-based ops. However, r4k_on_each_cpu is used +- * in both cases so there is no easy way to tell what kind of op is +- * executed to the other cores. The best we can probably do is +- * to restrict that call when a CM is not present because both +- * CM-based SMP protocols (CMP & CPS) restrict index-based cache ops. 
+- */ +- if (!mips_cm_present()) ++ if (r4k_op_needs_ipi(type)) + smp_call_function_many(&cpu_foreign_map, func, info, 1); + func(info); + preempt_enable(); +@@ -456,7 +492,7 @@ static inline void local_r4k___flush_cac + + static void r4k___flush_cache_all(void) + { +- r4k_on_each_cpu(local_r4k___flush_cache_all, NULL); ++ r4k_on_each_cpu(R4K_INDEX, local_r4k___flush_cache_all, NULL); + } + + static inline int has_valid_asid(const struct mm_struct *mm) +@@ -503,7 +539,7 @@ static void r4k_flush_cache_range(struct + int exec = vma->vm_flags & VM_EXEC; + + if (cpu_has_dc_aliases || (exec && !cpu_has_ic_fills_f_dc)) +- r4k_on_each_cpu(local_r4k_flush_cache_range, vma); ++ r4k_on_each_cpu(R4K_INDEX, local_r4k_flush_cache_range, vma); + } + + static inline void local_r4k_flush_cache_mm(void * args) +@@ -535,7 +571,7 @@ static void r4k_flush_cache_mm(struct mm + if (!cpu_has_dc_aliases) + return; + +- r4k_on_each_cpu(local_r4k_flush_cache_mm, mm); ++ r4k_on_each_cpu(R4K_INDEX, local_r4k_flush_cache_mm, mm); + } + + struct flush_cache_page_args { +@@ -629,7 +665,7 @@ static void r4k_flush_cache_page(struct + args.addr = addr; + args.pfn = pfn; + +- r4k_on_each_cpu(local_r4k_flush_cache_page, &args); ++ r4k_on_each_cpu(R4K_KERN, local_r4k_flush_cache_page, &args); + } + + static inline void local_r4k_flush_data_cache_page(void * addr) +@@ -642,18 +678,23 @@ static void r4k_flush_data_cache_page(un + if (in_atomic()) + local_r4k_flush_data_cache_page((void *)addr); + else +- r4k_on_each_cpu(local_r4k_flush_data_cache_page, (void *) addr); ++ r4k_on_each_cpu(R4K_KERN, local_r4k_flush_data_cache_page, ++ (void *) addr); + } + + struct flush_icache_range_args { + unsigned long start; + unsigned long end; ++ unsigned int type; + }; + +-static inline void local_r4k_flush_icache_range(unsigned long start, unsigned long end) ++static inline void __local_r4k_flush_icache_range(unsigned long start, ++ unsigned long end, ++ unsigned int type) + { + if (!cpu_has_ic_fills_f_dc) { +- if 
(end - start >= dcache_size) { ++ if (type == R4K_INDEX || ++ (type & R4K_INDEX && end - start >= dcache_size)) { + r4k_blast_dcache(); + } else { + R4600_HIT_CACHEOP_WAR_IMPL; +@@ -661,7 +702,8 @@ static inline void local_r4k_flush_icach + } + } + +- if (end - start > icache_size) ++ if (type == R4K_INDEX || ++ (type & R4K_INDEX && end - start > icache_size)) + r4k_blast_icache(); + else { + switch (boot_cpu_type()) { +@@ -687,23 +729,59 @@ static inline void local_r4k_flush_icach + #endif + } + ++static inline void local_r4k_flush_icache_range(unsigned long start, ++ unsigned long end) ++{ ++ __local_r4k_flush_icache_range(start, end, R4K_KERN | R4K_INDEX); ++} ++ + static inline void local_r4k_flush_icache_range_ipi(void *args) + { + struct flush_icache_range_args *fir_args = args; + unsigned long start = fir_args->start; + unsigned long end = fir_args->end; ++ unsigned int type = fir_args->type; + +- local_r4k_flush_icache_range(start, end); ++ __local_r4k_flush_icache_range(start, end, type); + } + + static void r4k_flush_icache_range(unsigned long start, unsigned long end) + { + struct flush_icache_range_args args; ++ unsigned long size, cache_size; + + args.start = start; + args.end = end; ++ args.type = R4K_KERN | R4K_INDEX; + +- r4k_on_each_cpu(local_r4k_flush_icache_range_ipi, &args); ++ if (in_atomic()) { ++ /* ++ * We can't do blocking IPI calls from atomic context, so fall ++ * back to pure address-based cache ops if they globalize. ++ */ ++ if (!r4k_index_globalized && r4k_hit_globalized) { ++ args.type &= ~R4K_INDEX; ++ } else { ++ /* Just do it locally instead. */ ++ local_r4k_flush_icache_range(start, end); ++ instruction_hazard(); ++ return; ++ } ++ } else if (!r4k_index_globalized && r4k_hit_globalized) { ++ /* ++ * If address-based cache ops are globalized, then we may be ++ * able to avoid the IPI for small flushes. 
++ */ ++ size = end - start; ++ cache_size = icache_size; ++ if (!cpu_has_ic_fills_f_dc) { ++ size *= 2; ++ cache_size += dcache_size; ++ } ++ if (size <= cache_size) ++ args.type &= ~R4K_INDEX; ++ } ++ r4k_on_each_cpu(args.type, local_r4k_flush_icache_range_ipi, &args); + instruction_hazard(); + } + +@@ -823,7 +901,12 @@ static void local_r4k_flush_cache_sigtra + + static void r4k_flush_cache_sigtramp(unsigned long addr) + { +- r4k_on_each_cpu(local_r4k_flush_cache_sigtramp, (void *) addr); ++ /* ++ * FIXME this is a bit broken when !r4k_hit_globalized, since the user ++ * code probably won't be mapped on other CPUs, so if the process is ++ * migrated, it could end up hitting stale icache lines. ++ */ ++ r4k_on_each_cpu(R4K_USER, local_r4k_flush_cache_sigtramp, (void *)addr); + } + + static void r4k_flush_icache_all(void) +@@ -837,6 +920,15 @@ struct flush_kernel_vmap_range_args { + int size; + }; + ++static inline void local_r4k_flush_kernel_vmap_range_index(void *args) ++{ ++ /* ++ * Aliases only affect the primary caches so don't bother with ++ * S-caches or T-caches. ++ */ ++ r4k_blast_dcache(); ++} ++ + static inline void local_r4k_flush_kernel_vmap_range(void *args) + { + struct flush_kernel_vmap_range_args *vmra = args; +@@ -847,12 +939,8 @@ static inline void local_r4k_flush_kerne + * Aliases only affect the primary caches so don't bother with + * S-caches or T-caches. 
+ */ +- if (cpu_has_safe_index_cacheops && size >= dcache_size) +- r4k_blast_dcache(); +- else { +- R4600_HIT_CACHEOP_WAR_IMPL; +- blast_dcache_range(vaddr, vaddr + size); +- } ++ R4600_HIT_CACHEOP_WAR_IMPL; ++ blast_dcache_range(vaddr, vaddr + size); + } + + static void r4k_flush_kernel_vmap_range(unsigned long vaddr, int size) +@@ -862,7 +950,12 @@ static void r4k_flush_kernel_vmap_range( + args.vaddr = (unsigned long) vaddr; + args.size = size; + +- r4k_on_each_cpu(local_r4k_flush_kernel_vmap_range, &args); ++ if (cpu_has_safe_index_cacheops && size >= dcache_size) ++ r4k_on_each_cpu(R4K_INDEX, ++ local_r4k_flush_kernel_vmap_range_index, NULL); ++ else ++ r4k_on_each_cpu(R4K_KERN, local_r4k_flush_kernel_vmap_range, ++ &args); + } + + static inline void rm7k_erratum31(void) diff --git a/target/linux/generic/patches-4.4/091-MIPS-c-r4k-Exclude-sibling-CPUs-in-SMP-calls.patch b/target/linux/generic/patches-4.4/091-MIPS-c-r4k-Exclude-sibling-CPUs-in-SMP-calls.patch new file mode 100644 index 0000000000..8d5030c84b --- /dev/null +++ b/target/linux/generic/patches-4.4/091-MIPS-c-r4k-Exclude-sibling-CPUs-in-SMP-calls.patch @@ -0,0 +1,37 @@ +From: James Hogan +Date: Thu, 3 Mar 2016 21:30:42 +0000 +Subject: [PATCH] MIPS: c-r4k: Exclude sibling CPUs in SMP calls + +When performing SMP calls to foreign cores, exclude sibling CPUs from +the provided map, as we already handle the local core on the current +CPU. This prevents an IPI call from for example core 0, VPE 1 to VPE 0 +on the same core. 
+ +Signed-off-by: James Hogan +Cc: Ralf Baechle +Cc: Paul Burton +Cc: linux-mips@linux-mips.org +--- + +--- a/arch/mips/mm/c-r4k.c ++++ b/arch/mips/mm/c-r4k.c +@@ -96,8 +96,17 @@ static inline void r4k_on_each_cpu(unsig + void (*func) (void *info), void *info) + { + preempt_disable(); +- if (r4k_op_needs_ipi(type)) +- smp_call_function_many(&cpu_foreign_map, func, info, 1); ++ /* cpu_foreign_map and cpu_sibling_map[] undeclared when !CONFIG_SMP */ ++#ifdef CONFIG_SMP ++ if (r4k_op_needs_ipi(type)) { ++ struct cpumask mask; ++ ++ /* exclude sibling CPUs */ ++ cpumask_andnot(&mask, &cpu_foreign_map, ++ &cpu_sibling_map[smp_processor_id()]); ++ smp_call_function_many(&mask, func, info, 1); ++ } ++#endif + func(info); + preempt_enable(); + } diff --git a/target/linux/generic/patches-4.4/100-MIPS-fix-MT_SMP-cacheflush.patch b/target/linux/generic/patches-4.4/100-MIPS-fix-MT_SMP-cacheflush.patch deleted file mode 100644 index 14a10ba147..0000000000 --- a/target/linux/generic/patches-4.4/100-MIPS-fix-MT_SMP-cacheflush.patch +++ /dev/null @@ -1,17 +0,0 @@ -Fix crash on cache flush with the MT_SMP variant - -Signed-off-by: Felix Fietkau - ---- a/arch/mips/mm/c-r4k.c -+++ b/arch/mips/mm/c-r4k.c -@@ -60,8 +60,10 @@ static inline void r4k_on_each_cpu(void - * to restrict that call when a CM is not present because both - * CM-based SMP protocols (CMP & CPS) restrict index-based cache ops. - */ -+#ifndef CONFIG_MIPS_MT_SMP - if (!mips_cm_present()) - smp_call_function_many(&cpu_foreign_map, func, info, 1); -+#endif - func(info); - preempt_enable(); - } -- 2.30.2