samples: add Terra samples

source: https://github.com/zdevito/terra/tree/master/tests
2026-01-14 13:25:50 +00:00 · 2016-01-28 11:22:27 +07:00
parent e32a837fb2
commit 9b8b39f444
370 changed files with 11921 additions and 0 deletions
--- a/samples/Terra/classifyfloatstructs.t
+++ b/samples/Terra/classifyfloatstructs.t
@@ -0,0 +1,137 @@
+-- reading and returning by reference produces far more compact assembly
+-- in terra than reading and returning by value, even when a by-value function
+-- is embedded in a by-reference wrapper. Also, optimization behavior varies
+-- wildly. Here's a demonstration:
+ 
+local float4 = tuple(float,float,float,float)
+ 
+-- first, comparing assembly code size of stand-alone functions which all do 
+-- the same operation, in different styles. This is after aggressive 
+-- optimization by the LLVM backend
+--------------------------------------------------------------------------------
+ 
+-- read by value, return by value
+-- 26 instructions, 120 bytes
+local terra add4_val_val(a : float4, b : float4) : float4
+    return {
+        a._0+b._0,
+        a._1+b._1,
+        a._2+b._2,
+        a._3+b._3
+    }
+end
+ 
+-- read by value, return by reference
+-- 24 instructions, 103 bytes
+local terra add4_val_ref(c : &float4, a : float4, b: float4) : {}
+    c._0 = a._0 + b._0
+    c._1 = a._1 + b._1
+    c._2 = a._2 + b._2
+    c._3 = a._3 + b._3
+end
+ 
+-- read by reference, return by value
+-- 14 instructions, 74 bytes
+local terra add4_ref_val(a : &float4, b : &float4) : float4
+    return {
+        a._0+b._0,
+        a._1+b._1,
+        a._2+b._2,
+        a._3+b._3
+    }
+end
+ 
+-- read by reference, return by reference
+-- 12 instructions, 57 bytes
+local terra add4_ref_ref(c : &float4, a : &float4, b: &float4) : {}
+    c._0 = a._0 + b._0
+    c._1 = a._1 + b._1
+    c._2 = a._2 + b._2
+    c._3 = a._3 + b._3
+end
+ 
+-- read by reference, return by reference, BUT use temporary variables
+-- 4 instructions, 12 bytes (!!!)
+-- what happens here is that the tempvars are understood and treated
+-- as a single SIMD register, and so only one addition is executed.
+-- this is already reflected in the bytecode passed to LLVM, so I suppose 
+-- terra does this optimization.
+local terra add4_ref_ref_tempvar(c : &float4, a : &float4, b: &float4) : {}
+    var x = a._0 + b._0
+    var y = a._1 + b._1
+    var z = a._2 + b._2
+    var w = a._3 + b._3
+    c._0 = x
+    c._1 = y
+    c._2 = z
+    c._3 = w
+end
+ 
+-- turn on always-inline for later
+add4_val_val:setinlined(true)
+add4_val_ref:setinlined(true)
+add4_ref_val:setinlined(true)
+add4_ref_ref:setinlined(true)
+ 
+-- uncomment to look at individual LLVM bytecode & disassembly
+add4_val_val:disas()
+-- add4_val_ref:disas()
+-- add4_ref_val:disas()
+-- add4_ref_ref:disas()
+-- add4_ref_ref_tempvar:disas()
+ 
+
+if terralib.lookupsymbol and require("ffi").os ~= "Windows" then
+    terra sizecheck() 
+     var si : terralib.SymbolInfo
+     terralib.lookupsymbol(add4_val_val,&si)
+     return si.size
+    end
+    assert(sizecheck() < 16)
+end
+--------------------------------------------------------------------------------
+ 
+-- up to this point, one could argue that functions are always inlined into 
+-- other functions, and that the instructions dissolve in the greater scheme
+-- of things.
+ 
+-- if that is true, let's attempt to convert one style into another and see
+-- if the optimizations catch up.
+ 
+-- read by value, return by value -> read by reference, return by reference
+-- the clunky solution to the slim interface
+-- 38 instructions (!), 193 bytes (!!), so it gets *worse*
+local terra add4_val_val_to_ref_ref(c : &float4, a : &float4, b: &float4) : {}
+    @c = add4_val_val(@a, @b)
+end
+ 
+-- read by reference, return by reference -> read by value, return by value
+-- the slim solution to the clunky interface
+-- 13 instructions, 61 bytes with tempvar -- wow, that's better than
+-- the original function with the actual code in it!
+local terra add4_ref_ref_to_val_val(a : float4, b : float4) : float4
+    var c : float4
+    add4_ref_ref_tempvar(&c, &a, &b)
+    return c
+end
+ 
+-- so what happens if we do a conversion back to the by-reference interface?
+-- 41 instructions, 194 bytes
+local terra add4_ref_ref_to_val_val_to_ref_ref(c : &float4, a : &float4, b: &float4) : {}
+    @c = add4_ref_ref_to_val_val(@a, @b)
+end
+ 
+-- and nest it once more, back to the by-value interface
+-- 47 instructions, 208 bytes
+-- so once we pass structs by-value, we'll never recover.
+local terra add4_ref_ref_to_val_val_to_ref_ref_to_val_val(a : float4, b : float4) : float4
+    var c : float4
+    add4_ref_ref_to_val_val_to_ref_ref(&c, &a, &b)
+    return c
+end
+ 
+-- uncomment to look at individual disassembly
+-- add4_val_val_to_ref_ref:disas()
+-- add4_ref_ref_to_val_val:disas()
+-- add4_ref_ref_to_val_val_to_ref_ref:disas()
+-- add4_ref_ref_to_val_val_to_ref_ref_to_val_val:disas()