[toolchain/gcc]: backport fa526 optimization for gcc 4.5+
[openwrt/svn-archive/archive.git] / toolchain / gcc / patches / linaro / 995-fa526.patch
1 --- a/gcc/config/arm/arm-cores.def
2 +++ b/gcc/config/arm/arm-cores.def
3 @@ -74,6 +74,7 @@ ARM_CORE("strongarm", strongarm, 4,
4 ARM_CORE("strongarm110", strongarm110, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
5 ARM_CORE("strongarm1100", strongarm1100, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
6 ARM_CORE("strongarm1110", strongarm1110, 4, FL_MODE26 | FL_LDSCHED | FL_STRONG, fastmul)
7 +ARM_CORE("fa526", fa526, 4, FL_LDSCHED, fastmul)
8
9 /* V4T Architecture Processors */
10 ARM_CORE("arm7tdmi", arm7tdmi, 4T, FL_CO_PROC , fastmul)
11 --- a/gcc/config/arm/arm.md
12 +++ b/gcc/config/arm/arm.md
13 @@ -435,7 +435,7 @@
14
15 (define_attr "generic_sched" "yes,no"
16 (const (if_then_else
17 - (ior (eq_attr "tune" "arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
18 + (ior (eq_attr "tune" "fa526,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4")
19 (eq_attr "tune_cortexr4" "yes"))
20 (const_string "no")
21 (const_string "yes"))))
22 @@ -467,6 +467,7 @@
23 (include "arm1020e.md")
24 (include "arm1026ejs.md")
25 (include "arm1136jfs.md")
26 +(include "fa526.md")
27 (include "cortex-a5.md")
28 (include "cortex-a8.md")
29 (include "cortex-a9.md")
30 --- a/gcc/config/arm/arm-tune.md
31 +++ b/gcc/config/arm/arm-tune.md
32 @@ -1,5 +1,5 @@
33 ;; -*- buffer-read-only: t -*-
34 ;; Generated automatically by gentune.sh from arm-cores.def
35 (define_attr "tune"
36 - "arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
37 + "arm2,arm250,arm3,arm6,arm60,arm600,arm610,arm620,arm7,arm7d,arm7di,arm70,arm700,arm700i,arm710,arm720,arm710c,arm7100,arm7500,arm7500fe,arm7m,arm7dm,arm7dmi,arm8,arm810,strongarm,strongarm110,strongarm1100,strongarm1110,fa526,arm7tdmi,arm7tdmis,arm710t,arm720t,arm740t,arm9,arm9tdmi,arm920,arm920t,arm922t,arm940t,ep9312,arm10tdmi,arm1020t,arm9e,arm946es,arm966es,arm968es,arm10e,arm1020e,arm1022e,xscale,iwmmxt,iwmmxt2,arm926ejs,arm1026ejs,arm1136js,arm1136jfs,arm1176jzs,arm1176jzfs,mpcorenovfp,mpcore,arm1156t2s,arm1156t2fs,cortexa5,cortexa8,cortexa9,cortexr4,cortexr4f,cortexm4,cortexm3,cortexm1,cortexm0"
38 (const (symbol_ref "((enum attr_tune) arm_tune)")))
39 --- a/gcc/config/arm/bpabi.h
40 +++ b/gcc/config/arm/bpabi.h
41 @@ -52,7 +52,8 @@
42 /* The BPABI integer comparison routines return { -1, 0, 1 }. */
43 #define TARGET_LIB_INT_CMP_BIASED !TARGET_BPABI
44
45 -#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*|march=armv4:--fix-v4bx}"
46 +#define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*\
47 +|march=armv4|mcpu=fa526:--fix-v4bx}"
48
49 #define BE8_LINK_SPEC " %{mbig-endian:%{march=armv7-a|mcpu=cortex-a5|mcpu=cortex-a8|mcpu=cortex-a9:%{!r:--be8}}}"
50
51 --- /dev/null
52 +++ b/gcc/config/arm/fa526.md
53 @@ -0,0 +1,161 @@
54 +;; Faraday FA526 Pipeline Description
55 +;; Copyright (C) 2010 Free Software Foundation, Inc.
56 +;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
57 +
58 +;; This file is part of GCC.
59 +;;
60 +;; GCC is free software; you can redistribute it and/or modify it under
61 +;; the terms of the GNU General Public License as published by the Free
62 +;; Software Foundation; either version 3, or (at your option) any later
63 +;; version.
64 +;;
65 +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
66 +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
67 +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
68 +;; for more details.
69 +;;
70 +;; You should have received a copy of the GNU General Public License
71 +;; along with GCC; see the file COPYING3. If not see
72 +;; <http://www.gnu.org/licenses/>.
73 +
74 +;; These descriptions are based on the information contained in the
75 +;; FA526 Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
76 +;;
77 +;; Modeled pipeline characteristics:
78 +;; LD -> any use: latency = 3 (2 cycle penalty).
79 +;; ALU -> any use: latency = 2 (1 cycle penalty).
80 +
81 +;; This automaton provides a pipeline description for the Faraday
82 +;; FA526 core.
83 +;;
84 +;; The model given here assumes that the condition for all conditional
85 +;; instructions is "true", i.e., that all of the instructions are
86 +;; actually executed.
87 +
88 +(define_automaton "fa526")
89 +
90 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
91 +;; Pipelines
92 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
93 +
94 +;; There is a single pipeline.
95 +;;
96 +;; The ALU pipeline has fetch, decode, execute, memory, and
97 +;; write stages. We only need to model the execute, memory and write
98 +;; stages.
99 +
100 +;; S E M W
101 +
102 +(define_cpu_unit "fa526_core" "fa526")
103 +
104 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
105 +;; ALU Instructions
106 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
107 +
108 +;; ALU instructions require two cycles to execute, and use the ALU
109 +;; pipeline in each of the three stages. The results are available
110 +;; after the execute stage has finished.
111 +;;
112 +;; If the destination register is the PC, the pipelines are stalled
113 +;; for several cycles. That case is not modeled here.
114 +
115 +;; ALU operations
116 +(define_insn_reservation "526_alu_op" 1
117 + (and (eq_attr "tune" "fa526")
118 + (eq_attr "type" "alu"))
119 + "fa526_core")
120 +
121 +(define_insn_reservation "526_alu_shift_op" 2
122 + (and (eq_attr "tune" "fa526")
123 + (eq_attr "type" "alu_shift,alu_shift_reg"))
124 + "fa526_core")
125 +
126 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
127 +;; Multiplication Instructions
128 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
129 +
130 +(define_insn_reservation "526_mult1" 2
131 + (and (eq_attr "tune" "fa526")
132 + (eq_attr "insn" "smlalxy,smulxy,smlaxy,smlalxy"))
133 + "fa526_core")
134 +
135 +(define_insn_reservation "526_mult2" 5
136 + (and (eq_attr "tune" "fa526")
137 + (eq_attr "insn" "mul,mla,muls,mlas,umull,umlal,smull,smlal,umulls,\
138 + umlals,smulls,smlals,smlawx"))
139 + "fa526_core*4")
140 +
141 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
142 +;; Load/Store Instructions
143 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
144 +
145 +;; The models for load/store instructions do not distinguish
146 +;; operations with a base register writeback (such as "ldm!") from
147 +;; those without one. These models assume that all memory references
148 +;; hit in dcache.
149 +
150 +(define_insn_reservation "526_load1_op" 3
151 + (and (eq_attr "tune" "fa526")
152 + (eq_attr "type" "load1,load_byte"))
153 + "fa526_core")
154 +
155 +(define_insn_reservation "526_load2_op" 4
156 + (and (eq_attr "tune" "fa526")
157 + (eq_attr "type" "load2"))
158 + "fa526_core*2")
159 +
160 +(define_insn_reservation "526_load3_op" 5
161 + (and (eq_attr "tune" "fa526")
162 + (eq_attr "type" "load3"))
163 + "fa526_core*3")
164 +
165 +(define_insn_reservation "526_load4_op" 6
166 + (and (eq_attr "tune" "fa526")
167 + (eq_attr "type" "load4"))
168 + "fa526_core*4")
169 +
170 +(define_insn_reservation "526_store1_op" 0
171 + (and (eq_attr "tune" "fa526")
172 + (eq_attr "type" "store1"))
173 + "fa526_core")
174 +
175 +(define_insn_reservation "526_store2_op" 1
176 + (and (eq_attr "tune" "fa526")
177 + (eq_attr "type" "store2"))
178 + "fa526_core*2")
179 +
180 +(define_insn_reservation "526_store3_op" 2
181 + (and (eq_attr "tune" "fa526")
182 + (eq_attr "type" "store3"))
183 + "fa526_core*3")
184 +
185 +(define_insn_reservation "526_store4_op" 3
186 + (and (eq_attr "tune" "fa526")
187 + (eq_attr "type" "store4"))
188 + "fa526_core*4")
189 +
190 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
191 +;; Branch and Call Instructions
192 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
193 +
194 +;; Branch instructions are difficult to model accurately. The FA526
195 +;; core can predict most branches. If the branch is predicted
196 +;; correctly, and predicted early enough, the branch can be completely
197 +;; eliminated from the instruction stream. Some branches can
198 +;; therefore appear to require zero cycles to execute. We assume that
199 +;; all branches are predicted correctly, and that the latency is
200 +;; therefore the minimum value.
201 +
202 +(define_insn_reservation "526_branch_op" 0
203 + (and (eq_attr "tune" "fa526")
204 + (eq_attr "type" "branch"))
205 + "fa526_core")
206 +
207 +;; The latency for a call is the latency until its result is available,
208 +;; i.e. until R0 holds the integer return value. In most cases the return
209 +;; value is set by a mov instruction, which has 1 cycle latency.
210 +(define_insn_reservation "526_call_op" 1
211 + (and (eq_attr "tune" "fa526")
212 + (eq_attr "type" "call"))
213 + "fa526_core")
214 +
215 --- a/gcc/config/arm/t-arm
216 +++ b/gcc/config/arm/t-arm
217 @@ -24,6 +24,7 @@ MD_INCLUDES= $(srcdir)/config/arm/arm-t
218 $(srcdir)/config/arm/arm1020e.md \
219 $(srcdir)/config/arm/arm1026ejs.md \
220 $(srcdir)/config/arm/arm1136jfs.md \
221 + $(srcdir)/config/arm/fa526.md \
222 $(srcdir)/config/arm/arm926ejs.md \
223 $(srcdir)/config/arm/cirrus.md \
224 $(srcdir)/config/arm/fpa.md \
225 --- a/gcc/config/arm/t-arm-elf
226 +++ b/gcc/config/arm/t-arm-elf
227 @@ -36,6 +36,10 @@ MULTILIB_DIRNAMES = arm thumb
228 MULTILIB_EXCEPTIONS =
229 MULTILIB_MATCHES =
230
231 +#MULTILIB_OPTIONS += mcpu=fa526
232 +#MULTILIB_DIRNAMES += fa526
233 +#MULTILIB_EXCEPTIONS += *mthumb*/*mcpu=fa526
234 +
235 #MULTILIB_OPTIONS += march=armv7
236 #MULTILIB_DIRNAMES += thumb2
237 #MULTILIB_EXCEPTIONS += march=armv7* marm/*march=armv7*
238 @@ -52,6 +56,7 @@ MULTILIB_MATCHES =
239 MULTILIB_OPTIONS += mfloat-abi=hard
240 MULTILIB_DIRNAMES += fpu
241 MULTILIB_EXCEPTIONS += *mthumb/*mfloat-abi=hard*
242 +MULTILIB_EXCEPTIONS += *mcpu=fa526/*mfloat-abi=hard*
243
244 # MULTILIB_OPTIONS += mcpu=ep9312
245 # MULTILIB_DIRNAMES += ep9312
246 --- a/gcc/doc/invoke.texi
247 +++ b/gcc/doc/invoke.texi
248 @@ -9900,7 +9900,8 @@ assembly code. Permissible names are: @
249 @samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-m4}, @samp{cortex-m3},
250 @samp{cortex-m1},
251 @samp{cortex-m0},
252 -@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312}.
253 +@samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2}, @samp{ep9312},
254 +@samp{fa526}.
255
256 @item -mtune=@var{name}
257 @opindex mtune