kernel: replace the SMP cacheflush workaround with c-r4k cache maintenance IPI patches from linux-mti.git
authorFelix Fietkau <nbd@nbd.name>
Wed, 15 Jun 2016 12:17:06 +0000 (14:17 +0200)
committerFelix Fietkau <nbd@nbd.name>
Wed, 15 Jun 2016 12:36:47 +0000 (14:36 +0200)
Signed-off-by: Felix Fietkau <nbd@nbd.name>
target/linux/generic/patches-4.4/090-MIPS-c-r4k-Use-IPI-calls-for-CM-indexed-cache-ops.patch [new file with mode: 0644]
target/linux/generic/patches-4.4/091-MIPS-c-r4k-Exclude-sibling-CPUs-in-SMP-calls.patch [new file with mode: 0644]
target/linux/generic/patches-4.4/100-MIPS-fix-MT_SMP-cacheflush.patch [deleted file]

diff --git a/target/linux/generic/patches-4.4/090-MIPS-c-r4k-Use-IPI-calls-for-CM-indexed-cache-ops.patch b/target/linux/generic/patches-4.4/090-MIPS-c-r4k-Use-IPI-calls-for-CM-indexed-cache-ops.patch
new file mode 100644 (file)
index 0000000..0c1c0a4
--- /dev/null
@@ -0,0 +1,317 @@
+From: James Hogan <james.hogan@imgtec.com>
+Date: Mon, 25 Jan 2016 21:30:00 +0000
+Subject: [PATCH] MIPS: c-r4k: Use IPI calls for CM indexed cache ops
+
+The Coherence Manager (CM) can propagate address-based ("hit") cache
+operations to other cores in the coherent system, relieving software of
+the need to use IPI calls; however, indexed cache operations are not
+propagated, since an index only identifies a line within one particular
+cache and is meaningless for the separate caches on other cores.
+
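+To make the distinction concrete, here is an illustrative fragment (not
+part of this patch) using the cache_op() helper from <asm/r4kcache.h>
+and the opcodes from <asm/cacheops.h>; vaddr and offset are placeholders
+and the usual cache-line loop is omitted:
+
+  /* Hit op: acts on whichever line holds this virtual address; on a
+   * CM system the operation is propagated to the other coherent
+   * caches, so no IPI is needed. */
+  cache_op(Hit_Writeback_Inv_D, vaddr);
+
+  /* Index op: acts on the line selected by way/index, regardless of
+   * what it holds; it only ever touches the local cache, so other
+   * cores must run it themselves, hence the IPI. */
+  cache_op(Index_Writeback_Inv_D, INDEX_BASE + offset);
+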
+r4k_on_each_cpu() previously had a special case for CONFIG_MIPS_MT_SMP,
+intended to avoid the IPIs when the only other CPUs in the system were
+other VPEs in the same core, and hence sharing the same caches. This was
+changed by commit cccf34e9411c ("MIPS: c-r4k: Fix cache flushing for MT
+cores") to apparently handle multi-core multi-VPE systems, but it
+focussed mainly on hit cache ops, so the IPI calls were still disabled
+entirely for CM systems.
+
+This doesn't normally cause problems, but tests can be written to hit
+these corner cases by using multiple threads, or changing task
+affinities to force the process to migrate cores. For example, the
+failure of mprotect RW->RX to globally sync icaches (via
+flush_cache_range) can be detected by modifying and mprotecting a code
+page on one core, and migrating to a different core to execute from it.
+
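+For illustration only (not part of this patch), a minimal userspace
+reproducer along those lines might look like the sketch below. Error
+handling is omitted, the two opcode words encode "jr $ra" with
+"addiu $v0, $zero, 41" in its delay slot for o32, and CPUs 0 and 1 are
+assumptions about the target system:
+
+  #define _GNU_SOURCE
+  #include <sched.h>
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <sys/mman.h>
+  #include <unistd.h>
+
+  static void run_on(int cpu)
+  {
+          cpu_set_t set;
+
+          CPU_ZERO(&set);
+          CPU_SET(cpu, &set);
+          sched_setaffinity(0, sizeof(set), &set);
+  }
+
+  int main(void)
+  {
+          long page = sysconf(_SC_PAGESIZE);
+          uint32_t *code = mmap(NULL, page, PROT_READ | PROT_WRITE,
+                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+          int (*fn)(void) = (int (*)(void))code;
+
+          run_on(0);              /* write the code on one core */
+          code[0] = 0x03e00008;   /* jr    $ra                          */
+          code[1] = 0x24020029;   /* addiu $v0, $zero, 41 (delay slot)  */
+
+          /* RW -> RX: flush_cache_range() must sync every core's
+           * icache, not just the local one. */
+          mprotect(code, page, PROT_READ | PROT_EXEC);
+
+          run_on(1);              /* migrate, then execute the new code */
+          printf("fn() = %d (expect 41)\n", fn());
+          return 0;
+  }
+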
+Most of the functions called by r4k_on_each_cpu() perform cache
+operations exclusively with a single addressing-type (virtual address vs
+indexed), so add a type argument and modify the callers to pass in
+R4K_USER (user virtual addressing), R4K_KERN (global kernel virtual
+addressing) or R4K_INDEX (index into cache).
+
+local_r4k_flush_icache_range() is split up, to allow it to be called
+from the rest of the kernel, or from r4k_flush_icache_range() where it
+will choose either indexed or hit cache operations based on the size of
+the range and the cache sizes.
+
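+As a worked example (cache sizes assumed purely for illustration): with
+a 32 KiB icache, a 32 KiB dcache and no cpu_has_ic_fills_f_dc, a 24 KiB
+range costs 2 * 24 KiB = 48 KiB of hit ops against a 64 KiB budget, so
+the globalized hit path is used and the IPI is avoided; a 40 KiB range
+costs 80 KiB, so the whole caches are blasted by index on each core
+instead.
+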
+local_r4k_flush_kernel_vmap_range() is split into two functions, each of
+which uses cache operations with a single addressing-type, with
+r4k_flush_kernel_vmap_range() making the decision whether to use indexed
+cache ops or not.
+
+Signed-off-by: James Hogan <james.hogan@imgtec.com>
+Cc: Ralf Baechle <ralf@linux-mips.org>
+Cc: Paul Burton <paul.burton@imgtec.com>
+Cc: Leonid Yegoshin <leonid.yegoshin@imgtec.com>
+Cc: linux-mips@linux-mips.org
+---
+
+--- a/arch/mips/mm/c-r4k.c
++++ b/arch/mips/mm/c-r4k.c
+@@ -40,6 +40,50 @@
+ #include <asm/mips-cm.h>
+ /*
++ * Bits describing what cache ops an IPI callback function may perform.
++ *
++ * R4K_USER   -       Virtual user address based cache operations.
++ *            Ineffective on other CPUs.
++ * R4K_KERN   -       Virtual kernel address based cache operations (including kmap).
++ *            Effective on other CPUs.
++ * R4K_INDEX -        Index based cache operations.
++ *            Effective on other CPUs.
++ */
++
++#define R4K_USER      BIT(0)
++#define R4K_KERN      BIT(1)
++#define R4K_INDEX     BIT(2)
++
++#ifdef CONFIG_SMP
++/* The Coherence manager propagates address-based cache ops to other cores */
++#define r4k_hit_globalized    mips_cm_present()
++#define r4k_index_globalized  0
++#else
++/* If there's only 1 CPU, then all cache ops are globalized to that 1 CPU */
++#define r4k_hit_globalized    1
++#define r4k_index_globalized  1
++#endif
++
++/**
++ * r4k_op_needs_ipi() - Decide if a cache op needs to be done on every core.
++ * @type:     Type of cache operations (R4K_USER, R4K_KERN or R4K_INDEX).
++ *
++ * Returns:   1 if the cache operation @type should be done on every core in
++ *            the system.
++ *            0 if the cache operation @type is globalized and only needs to
++ *            be performed on a simple CPU.
++ *            be performed on a single CPU.
++ */
++static inline bool r4k_op_needs_ipi(unsigned int type)
++{
++      /*
++       * If hardware doesn't globalize the required cache ops we must use IPIs
++       * to do so.
++       */
++      return (type & R4K_KERN  && !r4k_hit_globalized) ||
++             (type & R4K_INDEX && !r4k_index_globalized);
++}
++
++/*
+  * Special Variant of smp_call_function for use by cache functions:
+  *
+  *  o No return value
+@@ -48,19 +92,11 @@
+  *    primary cache.
+  *  o doesn't disable interrupts on the local CPU
+  */
+-static inline void r4k_on_each_cpu(void (*func) (void *info), void *info)
++static inline void r4k_on_each_cpu(unsigned int type,
++                                 void (*func) (void *info), void *info)
+ {
+       preempt_disable();
+-
+-      /*
+-       * The Coherent Manager propagates address-based cache ops to other
+-       * cores but not index-based ops. However, r4k_on_each_cpu is used
+-       * in both cases so there is no easy way to tell what kind of op is
+-       * executed to the other cores. The best we can probably do is
+-       * to restrict that call when a CM is not present because both
+-       * CM-based SMP protocols (CMP & CPS) restrict index-based cache ops.
+-       */
+-      if (!mips_cm_present())
++      if (r4k_op_needs_ipi(type))
+               smp_call_function_many(&cpu_foreign_map, func, info, 1);
+       func(info);
+       preempt_enable();
+@@ -456,7 +492,7 @@ static inline void local_r4k___flush_cac
+ static void r4k___flush_cache_all(void)
+ {
+-      r4k_on_each_cpu(local_r4k___flush_cache_all, NULL);
++      r4k_on_each_cpu(R4K_INDEX, local_r4k___flush_cache_all, NULL);
+ }
+ static inline int has_valid_asid(const struct mm_struct *mm)
+@@ -503,7 +539,7 @@ static void r4k_flush_cache_range(struct
+       int exec = vma->vm_flags & VM_EXEC;
+       if (cpu_has_dc_aliases || (exec && !cpu_has_ic_fills_f_dc))
+-              r4k_on_each_cpu(local_r4k_flush_cache_range, vma);
++              r4k_on_each_cpu(R4K_INDEX, local_r4k_flush_cache_range, vma);
+ }
+ static inline void local_r4k_flush_cache_mm(void * args)
+@@ -535,7 +571,7 @@ static void r4k_flush_cache_mm(struct mm
+       if (!cpu_has_dc_aliases)
+               return;
+-      r4k_on_each_cpu(local_r4k_flush_cache_mm, mm);
++      r4k_on_each_cpu(R4K_INDEX, local_r4k_flush_cache_mm, mm);
+ }
+ struct flush_cache_page_args {
+@@ -629,7 +665,7 @@ static void r4k_flush_cache_page(struct
+       args.addr = addr;
+       args.pfn = pfn;
+-      r4k_on_each_cpu(local_r4k_flush_cache_page, &args);
++      r4k_on_each_cpu(R4K_KERN, local_r4k_flush_cache_page, &args);
+ }
+ static inline void local_r4k_flush_data_cache_page(void * addr)
+@@ -642,18 +678,23 @@ static void r4k_flush_data_cache_page(un
+       if (in_atomic())
+               local_r4k_flush_data_cache_page((void *)addr);
+       else
+-              r4k_on_each_cpu(local_r4k_flush_data_cache_page, (void *) addr);
++              r4k_on_each_cpu(R4K_KERN, local_r4k_flush_data_cache_page,
++                              (void *) addr);
+ }
+ struct flush_icache_range_args {
+       unsigned long start;
+       unsigned long end;
++      unsigned int type;
+ };
+-static inline void local_r4k_flush_icache_range(unsigned long start, unsigned long end)
++static inline void __local_r4k_flush_icache_range(unsigned long start,
++                                                unsigned long end,
++                                                unsigned int type)
+ {
+       if (!cpu_has_ic_fills_f_dc) {
+-              if (end - start >= dcache_size) {
++              if (type == R4K_INDEX ||
++                  (type & R4K_INDEX && end - start >= dcache_size)) {
+                       r4k_blast_dcache();
+               } else {
+                       R4600_HIT_CACHEOP_WAR_IMPL;
+@@ -661,7 +702,8 @@ static inline void local_r4k_flush_icach
+               }
+       }
+-      if (end - start > icache_size)
++      if (type == R4K_INDEX ||
++          (type & R4K_INDEX && end - start > icache_size))
+               r4k_blast_icache();
+       else {
+               switch (boot_cpu_type()) {
+@@ -687,23 +729,59 @@ static inline void local_r4k_flush_icach
+ #endif
+ }
++static inline void local_r4k_flush_icache_range(unsigned long start,
++                                              unsigned long end)
++{
++      __local_r4k_flush_icache_range(start, end, R4K_KERN | R4K_INDEX);
++}
++
+ static inline void local_r4k_flush_icache_range_ipi(void *args)
+ {
+       struct flush_icache_range_args *fir_args = args;
+       unsigned long start = fir_args->start;
+       unsigned long end = fir_args->end;
++      unsigned int type = fir_args->type;
+-      local_r4k_flush_icache_range(start, end);
++      __local_r4k_flush_icache_range(start, end, type);
+ }
+ static void r4k_flush_icache_range(unsigned long start, unsigned long end)
+ {
+       struct flush_icache_range_args args;
++      unsigned long size, cache_size;
+       args.start = start;
+       args.end = end;
++      args.type = R4K_KERN | R4K_INDEX;
+-      r4k_on_each_cpu(local_r4k_flush_icache_range_ipi, &args);
++      if (in_atomic()) {
++              /*
++               * We can't do blocking IPI calls from atomic context, so fall
++               * back to pure address-based cache ops if they globalize.
++               */
++              if (!r4k_index_globalized && r4k_hit_globalized) {
++                      args.type &= ~R4K_INDEX;
++              } else {
++                      /* Just do it locally instead. */
++                      local_r4k_flush_icache_range(start, end);
++                      instruction_hazard();
++                      return;
++              }
++      } else if (!r4k_index_globalized && r4k_hit_globalized) {
++              /*
++               * If address-based cache ops are globalized, then we may be
++               * able to avoid the IPI for small flushes.
++               */
++              size = end - start;
++              cache_size = icache_size;
++              if (!cpu_has_ic_fills_f_dc) {
++                      size *= 2;
++                      cache_size += dcache_size;
++              }
++              if (size <= cache_size)
++                      args.type &= ~R4K_INDEX;
++      }
++      r4k_on_each_cpu(args.type, local_r4k_flush_icache_range_ipi, &args);
+       instruction_hazard();
+ }
+@@ -823,7 +901,12 @@ static void local_r4k_flush_cache_sigtra
+ static void r4k_flush_cache_sigtramp(unsigned long addr)
+ {
+-      r4k_on_each_cpu(local_r4k_flush_cache_sigtramp, (void *) addr);
++      /*
++       * FIXME this is a bit broken when !r4k_hit_globalized, since the user
++       * code probably won't be mapped on other CPUs, so if the process is
++       * migrated, it could end up hitting stale icache lines.
++       */
++      r4k_on_each_cpu(R4K_USER, local_r4k_flush_cache_sigtramp, (void *)addr);
+ }
+ static void r4k_flush_icache_all(void)
+@@ -837,6 +920,15 @@ struct flush_kernel_vmap_range_args {
+       int             size;
+ };
++static inline void local_r4k_flush_kernel_vmap_range_index(void *args)
++{
++      /*
++       * Aliases only affect the primary caches so don't bother with
++       * S-caches or T-caches.
++       */
++      r4k_blast_dcache();
++}
++
+ static inline void local_r4k_flush_kernel_vmap_range(void *args)
+ {
+       struct flush_kernel_vmap_range_args *vmra = args;
+@@ -847,12 +939,8 @@ static inline void local_r4k_flush_kerne
+        * Aliases only affect the primary caches so don't bother with
+        * S-caches or T-caches.
+        */
+-      if (cpu_has_safe_index_cacheops && size >= dcache_size)
+-              r4k_blast_dcache();
+-      else {
+-              R4600_HIT_CACHEOP_WAR_IMPL;
+-              blast_dcache_range(vaddr, vaddr + size);
+-      }
++      R4600_HIT_CACHEOP_WAR_IMPL;
++      blast_dcache_range(vaddr, vaddr + size);
+ }
+ static void r4k_flush_kernel_vmap_range(unsigned long vaddr, int size)
+@@ -862,7 +950,12 @@ static void r4k_flush_kernel_vmap_range(
+       args.vaddr = (unsigned long) vaddr;
+       args.size = size;
+-      r4k_on_each_cpu(local_r4k_flush_kernel_vmap_range, &args);
++      if (cpu_has_safe_index_cacheops && size >= dcache_size)
++              r4k_on_each_cpu(R4K_INDEX,
++                              local_r4k_flush_kernel_vmap_range_index, NULL);
++      else
++              r4k_on_each_cpu(R4K_KERN, local_r4k_flush_kernel_vmap_range,
++                              &args);
+ }
+ static inline void rm7k_erratum31(void)
diff --git a/target/linux/generic/patches-4.4/091-MIPS-c-r4k-Exclude-sibling-CPUs-in-SMP-calls.patch b/target/linux/generic/patches-4.4/091-MIPS-c-r4k-Exclude-sibling-CPUs-in-SMP-calls.patch
new file mode 100644 (file)
index 0000000..8d5030c
--- /dev/null
@@ -0,0 +1,37 @@
+From: James Hogan <james.hogan@imgtec.com>
+Date: Thu, 3 Mar 2016 21:30:42 +0000
+Subject: [PATCH] MIPS: c-r4k: Exclude sibling CPUs in SMP calls
+
+When performing SMP calls to foreign cores, exclude sibling CPUs from
+the provided map, as we already handle the local core on the current
+CPU. This prevents, for example, an IPI call from core 0, VPE 1 to
+VPE 0 on the same core.
+
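+As a toy illustration (plain userspace C, not kernel code) of the mask
+arithmetic on an assumed 2-core, 2-VPE system, where cpu_foreign_map is
+taken to hold one representative VPE per core:
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          /* Assumed layout: CPUs 0,1 = core 0; CPUs 2,3 = core 1. */
+          unsigned int cpu_foreign_map = (1u << 0) | (1u << 2);
+          unsigned int sibling_map[4]  = { 0x3, 0x3, 0xc, 0xc };
+          int cpu = 1;                    /* core 0, VPE 1 */
+
+          /* Same effect as cpumask_andnot(&mask, &cpu_foreign_map,
+           * &cpu_sibling_map[cpu]): drop our own core's VPEs. */
+          unsigned int ipi_mask = cpu_foreign_map & ~sibling_map[cpu];
+
+          /* Prints 0x4: only core 1's representative VPE is IPI'd;
+           * VPE 0 on our own core is no longer interrupted. */
+          printf("IPI mask from CPU %d: 0x%x\n", cpu, ipi_mask);
+          return 0;
+  }
+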
+Signed-off-by: James Hogan <james.hogan@imgtec.com>
+Cc: Ralf Baechle <ralf@linux-mips.org>
+Cc: Paul Burton <paul.burton@imgtec.com>
+Cc: linux-mips@linux-mips.org
+---
+
+--- a/arch/mips/mm/c-r4k.c
++++ b/arch/mips/mm/c-r4k.c
+@@ -96,8 +96,17 @@ static inline void r4k_on_each_cpu(unsig
+                                  void (*func) (void *info), void *info)
+ {
+       preempt_disable();
+-      if (r4k_op_needs_ipi(type))
+-              smp_call_function_many(&cpu_foreign_map, func, info, 1);
++      /* cpu_foreign_map and cpu_sibling_map[] undeclared when !CONFIG_SMP */
++#ifdef CONFIG_SMP
++      if (r4k_op_needs_ipi(type)) {
++              struct cpumask mask;
++
++              /* exclude sibling CPUs */
++              cpumask_andnot(&mask, &cpu_foreign_map,
++                             &cpu_sibling_map[smp_processor_id()]);
++              smp_call_function_many(&mask, func, info, 1);
++      }
++#endif
+       func(info);
+       preempt_enable();
+ }
diff --git a/target/linux/generic/patches-4.4/100-MIPS-fix-MT_SMP-cacheflush.patch b/target/linux/generic/patches-4.4/100-MIPS-fix-MT_SMP-cacheflush.patch
deleted file mode 100644 (file)
index 14a10ba..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-Fix crash on cache flush with the MT_SMP variant
-
-Signed-off-by: Felix Fietkau <nbd@nbd.name>
-
---- a/arch/mips/mm/c-r4k.c
-+++ b/arch/mips/mm/c-r4k.c
-@@ -60,8 +60,10 @@ static inline void r4k_on_each_cpu(void
-        * to restrict that call when a CM is not present because both
-        * CM-based SMP protocols (CMP & CPS) restrict index-based cache ops.
-        */
-+#ifndef CONFIG_MIPS_MT_SMP
-       if (!mips_cm_present())
-               smp_call_function_many(&cpu_foreign_map, func, info, 1);
-+#endif
-       func(info);
-       preempt_enable();
- }