diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index 32143b7e..af5816a1 100644 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -347,6 +347,14 @@ Ceylon: extensions: - .ceylon +Chapel: + type: programming + color: "#8dc63f" + aliases: + - chpl + extensions: + - .chpl + ChucK: lexer: Java extensions: diff --git a/lib/linguist/samples.json b/lib/linguist/samples.json index f48ee225..979c18ab 100644 --- a/lib/linguist/samples.json +++ b/lib/linguist/samples.json @@ -83,6 +83,9 @@ "Ceylon": [ ".ceylon" ], + "Chapel": [ + ".chpl" + ], "Cirru": [ ".cirru" ], @@ -792,8 +795,8 @@ "exception.zep.php" ] }, - "tokens_total": 650233, - "languages_total": 870, + "tokens_total": 659840, + "languages_total": 875, "tokens": { "ABAP": { "*/**": 1, @@ -16278,6 +16281,798 @@ "<=>": 1, "other.name": 1 }, + "Chapel": { + "//": 150, + "use": 5, + "BlockDist": 2, + "CyclicDist": 1, + "BlockCycDist": 1, + "ReplicatedDist": 2, + ";": 516, + "DimensionalDist2D": 2, + "ReplicatedDim": 2, + "BlockCycDim": 1, + "config": 10, + "const": 59, + "n": 16, + "Space": 12, + "{": 122, + "}": 120, + "BlockSpace": 2, + "dmapped": 8, + "Block": 4, + "(": 626, + "boundingBox": 2, + ")": 626, + "var": 72, + "BA": 3, + "[": 920, + "]": 920, + "int": 21, + "forall": 43, + "ba": 4, + "in": 76, + "do": 62, + "here.id": 7, + "writeln": 53, + "MyLocaleView": 5, + "#numLocales": 1, + "MyLocales": 5, + "locale": 1, + "reshape": 2, + "Locales": 6, + "BlockSpace2": 2, + "targetLocales": 1, + "BA2": 3, + "CyclicSpace": 2, + "Cyclic": 1, + "startIdx": 2, + "Space.low": 5, + "CA": 3, + "ca": 2, + "BlkCycSpace": 2, + "BlockCyclic": 1, + "blocksize": 1, + "BCA": 3, + "bca": 2, + "ReplicatedSpace": 2, + "RA": 11, + "ra": 4, + "RA.numElements": 1, + "A": 13, + "i": 250, + "j": 25, + "i*100": 1, + "+": 334, + "on": 7, + "here": 3, + "LocaleSpace.high": 3, + "nl1": 2, + "nl2": 2, + "if": 98, + "numLocales": 5, + "then": 80, + "else": 16, + "numLocales/2": 1, + "#nl1": 2, + "#nl2": 1, + "#nl1*nl2": 1, + "DimReplicatedBlockcyclicSpace": 2, + "new": 7, + "BlockCyclicDim": 1, + "lowIdx": 1, + "blockSize": 1, + "DRBA": 3, + "for": 36, + "locId1": 2, + "drba": 2, + "Helper": 2, + "print": 5, + "to": 9, + "the": 10, + "console": 1, + "Time": 2, + "get": 3, + "timing": 6, + "routines": 1, + "benchmarking": 1, + "block": 1, + "-": 345, + "distributed": 1, + "arrays": 6, + "luleshInit": 1, + "initialization": 1, + "code": 1, + "data": 1, + "set": 4, + "param": 25, + "useBlockDist": 5, + "CHPL_COMM": 1, + "use3DRepresentation": 4, + "false": 4, + "useSparseMaterials": 3, + "true": 5, + "printWarnings": 3, + "&&": 9, + "luleshInit.filename": 1, + "halt": 5, + "initialEnergy": 2, + "e": 84, + "initial": 1, + "energy": 5, + "value": 1, + "showProgress": 3, + "time": 9, + "and": 4, + "dt": 14, + "values": 1, + "each": 1, + "step": 1, + "debug": 8, + "various": 1, + "info": 1, + "doTiming": 4, + "main": 3, + "timestep": 1, + "loop": 2, + "printCoords": 2, + "final": 1, + "computed": 1, + "coordinates": 2, + "XI_M": 2, + "XI_M_SYMM": 4, + "XI_M_FREE": 3, + "XI_P": 2, + "c": 7, + "XI_P_SYMM": 3, + "XI_P_FREE": 3, + "ETA_M": 2, + "ETA_M_SYMM": 4, + "ETA_M_FREE": 3, + "ETA_P": 2, + "c0": 1, + "ETA_P_SYMM": 3, + "ETA_P_FREE": 3, + "ZETA_M": 2, + "ZETA_M_SYMM": 4, + "ZETA_M_FREE": 3, + "ZETA_P": 2, + "xc00": 1, + "ZETA_P_SYMM": 3, + "ZETA_P_FREE": 3, + "numElems": 2, + "numNodes": 1, + "initProblemSize": 1, + "ElemSpace": 4, + "#elemsPerEdge": 3, + "#numElems": 1, + "NodeSpace": 4, + "#nodesPerEdge": 3, + "#numNodes": 1, + 
"Elems": 45, + "Nodes": 16, + "x": 111, + "y": 107, + "z": 107, + "real": 60, + "nodesPerElem": 6, + "elemToNode": 7, + "nodesPerElem*index": 1, + "lxim": 3, + "lxip": 3, + "letam": 3, + "letap": 3, + "lzetam": 3, + "lzetap": 3, + "index": 4, + "XSym": 4, + "YSym": 4, + "ZSym": 4, + "sparse": 3, + "subdomain": 3, + "u_cut": 5, + "hgcoef": 1, + "qstop": 2, + "monoq_max_slope": 7, + "monoq_limiter_mult": 7, + "e_cut": 3, + "p_cut": 6, + "ss4o3": 1, + "/3.0": 2, + "q_cut": 2, + "v_cut": 2, + "qlc_monoq": 2, + "qqc_monoq": 2, + "qqc": 1, + "qqc2": 1, + "*": 260, + "qqc**2": 1, + "eosvmax": 14, + "eosvmin": 9, + "pmin": 7, + "emin": 7, + "dvovmax": 1, + "refdens": 3, + "deltatimemultlb": 1, + "deltatimemultub": 1, + "dtmax": 1, + "stoptime": 3, + "maxcycles": 3, + "max": 2, + "dtfixed": 2, + "MatElems": 9, + "MatElemsType": 2, + "enumerateMatElems": 2, + "proc": 44, + "type": 1, + "return": 15, + "Elems.type": 1, + "iter": 3, + "yield": 3, + "elemBC": 16, + "p": 10, + "pressure": 1, + "q": 13, + "ql": 3, + "linear": 1, + "term": 2, + "qq": 3, + "quadratic": 1, + "v": 9, + "//relative": 1, + "volume": 21, + "vnew": 9, + "volo": 5, + "reference": 1, + "delv": 3, + "m_vnew": 1, + "m_v": 1, + "vdov": 4, + "derivative": 1, + "over": 2, + "arealg": 2, + "elem": 3, + "characteristic": 2, + "length": 2, + "ss": 2, + "elemMass": 4, + "mass": 12, + "xd": 8, + "yd": 8, + "zd": 8, + "velocities": 2, + "xdd": 3, + "ydd": 3, + "zdd": 3, + "acceleration": 1, + "fx": 8, + "fy": 8, + "fz": 8, + "atomic": 2, + "forces": 1, + "nodalMass": 3, + "current": 1, + "deltatime": 3, + "variable": 1, + "increment": 1, + "dtcourant": 1, + "e20": 2, + "courant": 1, + "constraint": 2, + "dthydro": 1, + "change": 2, + "cycle": 5, + "iteration": 1, + "count": 1, + "simulation": 1, + "initLulesh": 2, + "st": 4, + "getCurrentTime": 4, + "while": 4, + "<": 42, + "iterTime": 2, + "TimeIncrement": 2, + "LagrangeLeapFrog": 1, + "deprint": 3, + "format": 9, + "et": 3, + "/cycle": 1, + "outfile": 1, + "open": 1, + "iomode.cw": 1, + "writer": 1, + "outfile.writer": 1, + "fmtstr": 4, + "writer.writeln": 1, + "writer.close": 1, + "outfile.close": 1, + "initCoordinates": 1, + "initElemToNodeMapping": 1, + "initGreekVars": 1, + "initXSyms": 1, + "initYSyms": 1, + "initZSyms": 1, + "//calculated": 1, + "fly": 1, + "using": 1, + "elemToNodes": 3, + "initMasses": 2, + "octantCorner": 2, + "initBoundaryConditions": 2, + "massAccum": 3, + "eli": 18, + "x_local": 11, + "y_local": 11, + "z_local": 11, + "*real": 40, + "localizeNeighborNodes": 6, + "CalcElemVolume": 3, + "neighbor": 2, + ".add": 1, + ".read": 1, + "/": 26, + "surfaceNode": 8, + "mask": 16, + "<<": 2, + "&": 18, + "f": 4, + "|": 14, + "xf0": 4, + "xcc": 4, + "check": 3, + "loc": 4, + "maxloc": 1, + "reduce": 2, + "zip": 1, + "freeSurface": 3, + "initFreeSurface": 1, + "b": 4, + "inline": 5, + "ref": 19, + "noi": 4, + "TripleProduct": 4, + "x1": 1, + "y1": 1, + "z1": 1, + "x2": 1, + "y2": 1, + "z2": 1, + "x3": 1, + "y3": 1, + "z3": 1, + "x1*": 1, + "y2*z3": 1, + "z2*y3": 1, + "x2*": 1, + "z1*y3": 1, + "y1*z3": 1, + "x3*": 1, + "y1*z2": 1, + "z1*y2": 1, + "dx61": 2, + "dy61": 2, + "dz61": 2, + "dx70": 2, + "dy70": 2, + "dz70": 2, + "dx63": 2, + "dy63": 2, + "dz63": 2, + "dx20": 2, + "dy20": 2, + "dz20": 2, + "dx50": 2, + "dy50": 2, + "dz50": 2, + "dx64": 2, + "dy64": 2, + "dz64": 2, + "dx31": 2, + "dy31": 2, + "dz31": 2, + "dx72": 2, + "dy72": 2, + "dz72": 2, + "dx43": 2, + "dy43": 2, + "dz43": 2, + "dx57": 2, + "dy57": 2, + "dz57": 2, + "dx14": 2, + "dy14": 2, + "dz14": 2, + "dx25": 
2, + "dy25": 2, + "dz25": 2, + "InitStressTermsForElems": 1, + "sigxx": 2, + "sigyy": 2, + "sigzz": 2, + "D": 9, + "CalcElemShapeFunctionDerivatives": 2, + "b_x": 18, + "b_y": 18, + "b_z": 18, + "fjxxi": 5, + ".125": 9, + "fjxet": 6, + "fjxze": 5, + "fjyxi": 5, + "fjyet": 6, + "fjyze": 5, + "fjzxi": 5, + "fjzet": 6, + "fjzze": 5, + "cjxxi": 5, + "cjxet": 6, + "cjxze": 5, + "cjyxi": 5, + "cjyet": 6, + "cjyze": 5, + "cjzxi": 5, + "cjzet": 6, + "cjzze": 5, + "CalcElemNodeNormals": 1, + "pfx": 15, + "pfy": 15, + "pfz": 15, + "ElemFaceNormal": 7, + "n1": 23, + "n2": 23, + "n3": 23, + "n4": 23, + "bisectX0": 3, + "bisectY0": 3, + "bisectZ0": 3, + "bisectX1": 3, + "bisectY1": 3, + "bisectZ1": 3, + "areaX": 5, + "areaY": 5, + "areaZ": 5, + "rx": 6, + "ry": 6, + "rz": 6, + "//results": 1, + "SumElemStressesToNodeForces": 1, + "stress_xx": 2, + "stress_yy": 2, + "stress_zz": 2, + "CalcElemVolumeDerivative": 1, + "VoluDer": 9, + "n0": 13, + "n5": 13, + "ox": 1, + "oy": 1, + "oz": 1, + "ox/12.0": 1, + "oy/12.0": 1, + "oz/12.0": 1, + "dvdx": 10, + "dvdy": 10, + "dvdz": 10, + "CalcElemFBHourglassForce": 1, + "hourgam": 7, + "coefficient": 4, + "hgfx": 2, + "hgfy": 2, + "hgfz": 2, + "hx": 3, + "hy": 3, + "hz": 3, + "shx": 3, + "shy": 3, + "shz": 3, + "CalcElemCharacteristicLength": 2, + "AreaFace": 7, + "p0": 7, + "p1": 7, + "p2": 7, + "p3": 7, + "gx": 5, + "gy": 5, + "gz": 5, + "area": 2, + "charLength": 2, + "sqrt": 10, + "CalcElemVelocityGradient": 2, + "xvel": 25, + "yvel": 25, + "zvel": 25, + "detJ": 5, + "d": 12, + "inv_detJ": 10, + "dyddx": 2, + "dxddy": 2, + "dzddx": 2, + "dxddz": 2, + "dzddy": 2, + "dyddz": 2, + "CalcPressureForElems": 4, + "p_new": 17, + "bvc": 15, + "pbvc": 14, + "e_old": 7, + "compression": 10, + "vnewc": 22, + "c1s": 3, + "abs": 8, + "//impossible": 1, + "targetdt": 1, + "//don": 1, + "t": 3, + "have": 2, + "negative": 2, + "determ": 1, + "can": 2, + "we": 1, + "be": 2, + "able": 1, + "write": 1, + "these": 1, + "as": 1, + "follows": 1, + "CalcVelocityForNodes": 1, + "local": 8, + "xdtmp": 4, + "ydtmp": 4, + "zdtmp": 4, + "CalcPositionForNodes": 1, + "ijk": 7, + "CalcLagrangeElements": 1, + "dxx": 6, + "dyy": 6, + "dzz": 6, + "CalcKinematicsForElems": 2, + "k": 20, + "vdovthird": 4, + "<=>": 7, + "0": 5, + "all": 1, + "elements": 3, + "8": 4, + "6": 1, + "nodal": 2, + "from": 2, + "global": 3, + "copy": 2, + "into": 3, + "xd_local": 4, + "yd_local": 4, + "zd_local": 4, + "dt2": 4, + "5": 1, + "wish": 1, + "this": 2, + "was": 1, + "too": 1, + "calculations": 1, + "relativeVolume": 3, + "1": 2, + "put": 1, + "velocity": 3, + "gradient": 3, + "quantities": 1, + "their": 1, + "2": 1, + "3": 1, + "sungeun": 1, + "Temporary": 1, + "array": 4, + "reused": 1, + "throughout": 1, + "delv_xi": 12, + "delv_eta": 12, + "delv_zeta": 12, + "position": 1, + "delx_xi": 7, + "delx_eta": 7, + "delx_zeta": 7, + "CalcQForElems": 1, + "MONOTONIC": 1, + "Q": 1, + "option": 1, + "Calculate": 1, + "gradients": 2, + "CalcMonotonicQGradientsForElems": 2, + "Transfer": 2, + "veloctiy": 1, + "first": 1, + "order": 1, + "problem": 1, + "commElements": 1, + "CommElements": 1, + "monoQ": 1, + "*/": 1, + "CalcMonotonicQForElems": 2, + "ApplyMaterialPropertiesForElems": 1, + "matelm": 2, + "vc": 6, + "exit": 1, + "EvalEOSForElems": 2, + "UpdateVolumesForElems": 1, + "tmpV": 4, + "ptiny": 9, + "xl": 26, + "yl": 26, + "zl": 26, + "xvl": 26, + "yvl": 26, + "zvl": 26, + "vol": 5, + "norm": 20, + "ax": 7, + "ay": 7, + "az": 7, + "dxv": 4, + "dyv": 4, + "dzv": 4, + "dxj": 1, + "dyj": 1, + "dzj": 1, + "dxi": 1, + 
"dyi": 1, + "dzi": 1, + "dxk": 1, + "dyk": 1, + "dzk": 1, + "dyi*dzj": 1, + "dzi*dyj": 1, + "dzi*dxj": 1, + "dxi*dzj": 1, + "dxi*dyj": 1, + "dyi*dxj": 1, + "ax*ax": 3, + "ay*ay": 3, + "az*az": 3, + "ax*dxv": 3, + "ay*dyv": 3, + "az*dzv": 3, + "dyj*dzk": 1, + "dzj*dyk": 1, + "dzj*dxk": 1, + "dxj*dzk": 1, + "dxj*dyk": 1, + "dyj*dxk": 1, + "dyk*dzi": 1, + "dzk*dyi": 1, + "dzk*dxi": 1, + "dxk*dzi": 1, + "dxk*dyi": 1, + "dyk*dxi": 1, + "//got": 1, + "rid": 1, + "of": 3, + "call": 1, + "through": 1, + "bcMask": 7, + "delvm": 27, + "delvp": 27, + "select": 6, + "when": 18, + "phixi": 10, + "phieta": 10, + "phizeta": 10, + "qlin": 4, + "qquad": 4, + "delvxxi": 4, + "delvxeta": 4, + "delvxzeta": 4, + "rho": 3, + "delvxxi**2": 1, + "phixi**2": 1, + "delvxeta**2": 1, + "phieta**2": 1, + "delvxzeta**2": 1, + "phizeta**2": 1, + "rho0": 8, + "delvc": 11, + "p_old": 8, + "q_old": 7, + "compHalfStep": 8, + "qq_old": 7, + "ql_old": 7, + "work": 5, + "e_new": 25, + "q_new": 11, + "vchalf": 2, + "CalcEnergyForElems": 2, + "CalcSoundSpeedForElems": 2, + "sixth": 2, + "pHalfStep": 5, + "vhalf": 1, + "ssc": 18, + "vhalf**2": 1, + "q_tilde": 4, + "**2": 6, + "enewc": 2, + "pnewc": 2, + "ssTmp": 4, + "elemToNodesTuple": 1, + "title": 2, + "string": 1, + "pi": 1, + "solarMass": 7, + "pi**2": 1, + "daysPerYear": 13, + "record": 1, + "body": 6, + "pos": 5, + "does": 1, + "not": 1, + "after": 1, + "it": 1, + "is": 1, + "up": 1, + "bodies": 8, + "numbodies": 5, + "bodies.numElements": 1, + "initSun": 2, + "writef": 2, + "advance": 2, + "b.v": 2, + "b.mass": 1, + ".v": 1, + "updateVelocities": 2, + "b1": 2, + "b2": 2, + "dpos": 4, + "b1.pos": 2, + "b2.pos": 2, + "mag": 3, + "sumOfSquares": 4, + "**3": 1, + "b1.v": 2, + "b2.mass": 2, + "b2.v": 1, + "b1.mass": 3, + "b.pos": 1, + "Random": 1, + "random": 1, + "number": 1, + "generation": 1, + "Timer": 2, + "class": 1, + "timer": 2, + "sort": 1, + "**15": 1, + "size": 1, + "sorted": 1, + "thresh": 6, + "recursive": 1, + "depth": 1, + "serialize": 1, + "verbose": 7, + "out": 1, + "many": 1, + "bool": 1, + "disable": 1, + "numbers": 1, + "fillRandom": 1, + "timer.start": 1, + "pqsort": 4, + "timer.stop": 1, + "timer.elapsed": 1, + "arr": 32, + "low": 12, + "arr.domain.low": 1, + "high": 14, + "arr.domain.high": 1, + "where": 2, + "arr.rank": 2, + "bubbleSort": 2, + "pivotVal": 9, + "findPivot": 2, + "pivotLoc": 3, + "partition": 2, + "serial": 1, + "cobegin": 1, + "mid": 7, + "ilo": 9, + "ihi": 6, + "low..high": 2 + }, "Cirru": { "print": 38, "array": 14, @@ -70803,6 +71598,7 @@ "COBOL": 90, "CSS": 43867, "Ceylon": 50, + "Chapel": 9607, "Cirru": 244, "Clojure": 510, "CoffeeScript": 2951, @@ -71003,6 +71799,7 @@ "COBOL": 4, "CSS": 2, "Ceylon": 1, + "Chapel": 5, "Cirru": 9, "Clojure": 7, "CoffeeScript": 9, @@ -71179,5 +71976,5 @@ "fish": 3, "wisp": 1 }, - "md5": "b82e97a2c9c123d8e370eeffe64a1a1d" + "md5": "4c7029e118ceca989e36b7f3c025a771" } \ No newline at end of file diff --git a/samples/Chapel/distributions.chpl b/samples/Chapel/distributions.chpl new file mode 100644 index 00000000..a3e5f81d --- /dev/null +++ b/samples/Chapel/distributions.chpl @@ -0,0 +1,304 @@ +// +// Distributions Primer +// +// This primer demonstrates uses of some of Chapel's standard +// distributions. 
To use these distributions in a Chapel program, +// the respective module must be used: +// +use BlockDist, CyclicDist, BlockCycDist, ReplicatedDist; +use DimensionalDist2D, ReplicatedDim, BlockCycDim; + +// +// For each distribution, we'll create a distributed domain and array +// and then initialize it just to give a brief flavor of how the +// distribution maps across locales. Running this example on 6 +// locales does a nice job of illustrating the distribution +// characteristics. +// +// All of these distributions support options to map to a different +// virtual locale grid than the one used by default (a +// multidimensional factoring of the built-in Locales array), as well +// as to control the amount of parallelism used in data parallel +// loops. See the Standard Distributions chapter of the language spec +// for more details. +// + +// +// Make the program size configurable from the command line. +// +config const n = 8; + +// +// Declare a 2-dimensional domain Space that we will later use to +// initialize the distributed domains. +// +const Space = {1..n, 1..n}; + +// +// The Block distribution distributes a bounding box from +// n-dimensional space across the target locale array viewed as an +// n-dimensional virtual locale grid. The bounding box is blocked +// into roughly equal portions across the locales. Note that domains +// declared over a Block distribution can also store indices outside +// of the bounding box; the bounding box is merely used to compute +// the blocking of space. +// +// In this example, we declare a 2-dimensional Block-distributed +// domain BlockSpace and a Block-distributed array BA declared over +// the domain. +// +const BlockSpace = Space dmapped Block(boundingBox=Space); +var BA: [BlockSpace] int; + +// +// To illustrate how the index set is distributed across locales, +// we'll use a forall loop to initialize each array element to the +// locale ID that stores that index/element/iteration. +// +forall ba in BA do + ba = here.id; + +// +// Output the Block-distributed array to visually see how the elements +// are partitioned across the locales. +// +writeln("Block Array Index Map"); +writeln(BA); +writeln(); + +// +// Most of Chapel's standard distributions support an optional +// targetLocales argument that permits you to pass in your own +// array of locales to be targeted. In general, the targetLocales +// argument should match the rank of the distribution. So for +// example, to map a Block to a [numLocales x 1] view of the +// locale set, one could do something like this: + +// +// We start by creating our own array of the locale values. Here +// we use the standard array reshape function for convenience, +// but more generally, this array could be accessed/assigned like any +// other. +// + +var MyLocaleView = {0..#numLocales, 1..1}; +var MyLocales: [MyLocaleView] locale = reshape(Locales, MyLocaleView); + +// +// Then we'll declare a distributed domain/array that targets +// this view of the locales: +// + +const BlockSpace2 = Space dmapped Block(boundingBox=Space, + targetLocales=MyLocales); +var BA2: [BlockSpace2] int; + +// +// Then we'll do a similar computation as before to verify where +// everything ended up: +// +forall ba in BA2 do + ba = here.id; + +writeln("Block Array Index Map"); +writeln(BA2); +writeln(); + + + +// +// Next, we'll perform a similar computation for the Cyclic distribution. 
+// Cyclic distributions start at a designated n-dimensional index and +// distribute the n-dimensional space across an n-dimensional array +// of locales in a round-robin fashion (in each dimension). As with +// the Block distribution, domains declared using the +// distribution may have indices lower than the starting index; that +// value should just be considered a parameterization of how the +// distribution is defined. +// +const CyclicSpace = Space dmapped Cyclic(startIdx=Space.low); +var CA: [CyclicSpace] int; + +forall ca in CA do + ca = here.id; + +writeln("Cyclic Array Index Map"); +writeln(CA); +writeln(); + + +// +// Next, we'll declare a Block-Cyclic distribution. These +// distributions also deal out indices in a round-robin fashion, +// but rather than dealing out singleton indices, they deal out blocks +// of indices. Thus, the BlockCyclic distribution is parameterized +// by a starting index (as with Cyclic) and a block size (per +// dimension) specifying how large the chunks to be dealt out are. +// +const BlkCycSpace = Space dmapped BlockCyclic(startIdx=Space.low, + blocksize=(2, 3)); +var BCA: [BlkCycSpace] int; + +forall bca in BCA do + bca = here.id; + +writeln("Block-Cyclic Array Index Map"); +writeln(BCA); +writeln(); + + +// +// The ReplicatedDist distribution is different: each of the +// original domain's indices - and the corresponding array elements - +// is replicated onto each locale. (Note: consistency among these +// array replicands is NOT maintained automatically.) +// +// This replication is observable in some cases but not others, +// as shown below. Note: this behavior may change in the future. +// +const ReplicatedSpace = Space dmapped ReplicatedDist(); +var RA: [ReplicatedSpace] int; + +// The replication is observable - this visits each replicand. +forall ra in RA do + ra = here.id; + +writeln("Replicated Array Index Map, ", RA.numElements, " elements total"); +writeln(RA); +writeln(); + +// +// The replication is observable when the replicated array is +// on the left-hand side. If the right-hand side is not replicated, +// it is copied into each replicand. +// We illustrate this using a non-distributed array. +// +var A: [Space] int = [(i,j) in Space] i*100 + j; +RA = A; +writeln("Replicated Array after being array-assigned into"); +writeln(RA); +writeln(); + +// +// Analogously, each replicand will be visited and +// other participating expressions will be computed on each locale +// (a) when the replicated array is assigned a scalar: +// RA = 5; +// (b) when it appears first in a zippered forall loop: +// forall (ra, a) in zip(RA, A) do ...; +// (c) when it appears in a for loop: +// for ra in RA do ...; +// +// Zippering (RA,A) or (A,RA) in a 'for' loop will generate +// an error due to their different number of elements. + +// Let RA store the Index Map again, for the examples below. +forall ra in RA do + ra = here.id; + +// +// Only the local replicand is accessed - replication is NOT observable +// and consistency is NOT maintained - when: +// (a) the replicated array is indexed - an individual element is read... +// +on Locales(0) do + writeln("on ", here, ": ", RA(Space.low)); +on Locales(LocaleSpace.high) do + writeln("on ", here, ": ", RA(Space.low)); +writeln(); + +// ...or an individual element is written; +on Locales(LocaleSpace.high) do + RA(Space.low) = 7777; + +writeln("Replicated Array after being indexed into"); +writeln(RA); +writeln(); + +// +// (b) the replicated array is on the right-hand side of an assignment...
+// +on Locales(LocaleSpace.high) do + A = RA + 4; +writeln("Non-Replicated Array after assignment from Replicated Array + 4"); +writeln(A); +writeln(); + +// +// (c) ...or, generally, the replicated array or domain participates +// in a zippered forall loop, but not in the first position. +// The loop could look like: +// +// forall (a, (i,j), ra) in (A, ReplicatedSpace, RA) do ...; +// + + +// +// The DimensionalDist2D distribution lets us build a 2D distribution +// as a composition of specifiers for individual dimensions. +// Under such a "dimensional" distribution each dimension is handled +// independently of the other. +// +// The dimension specifiers are similar to the corresponding multi-dimensional +// distributions in constructor arguments and index-to-locale mapping rules. +// However, instead of an array of locales, a specifier constructor +// accepts just the number of locales that the indices in the corresponding +// dimension will be distributed across. +// +// The DimensionalDist2D constructor requires: +// * an [0..nl1-1, 0..nl2-1] array of locales, where +// nl1 and nl2 are the number of locales in each dimension, and +// * two dimension specifiers, created for nl1 and nl2 locale counts, resp. +// +// Presently, the following dimension specifiers are available +// (shown here with their constructor arguments): +// +// * ReplicatedDim(numLocales) +// * BlockDim(numLocales, boundingBoxLow, boundingBoxHigh) +// * BlockCyclicDim(lowIdx, blockSize, numLocales) +// + +// +// The following example creates a dimensional distribution that +// replicates over 2 locales (when available) in the first dimension +// and distributes using block-cyclic distribution in the second dimension. +// The example computes nl1 and nl2 and reshapes MyLocales correspondingly. +// + +var (nl1, nl2) = if numLocales == 1 then (1, 1) else (2, numLocales/2); +MyLocaleView = {0..#nl1, 0..#nl2}; +MyLocales = reshape(Locales[0..#nl1*nl2], MyLocaleView); + +const DimReplicatedBlockcyclicSpace = Space + dmapped DimensionalDist2D(MyLocales, + new ReplicatedDim(numLocales = nl1), + new BlockCyclicDim(numLocales = nl2, + lowIdx = 1, blockSize = 2)); + +var DRBA: [DimReplicatedBlockcyclicSpace] int; + +// The ReplicatedDim specifier always accesses the local replicand. +// (This differs from how the ReplicatedDist distribution works.) +// +// This example visits each replicand. The behavior is the same +// regardless of the second index into MyLocales below. + +for locId1 in 0..#nl1 do on MyLocales[locId1, 0] { + + forall drba in DRBA do + drba = here.id; + + writeln("Dimensional2D(Replicated,BlockCyclic) Array Index Map", + " from ", here); + + // Technicality: 'writeln(DRBA)' would read DRBA always on Locale 0. + // Since we want to see what DRBA contains on the current locale, + // we use 'Helper' that is mapped using the default distribution. + // 'Helper = DRBA' captures the view of DRBA on the current locale, + // which we then print out. + + const Helper: [Space] int = DRBA; + writeln(Helper); + writeln(); + +} diff --git a/samples/Chapel/hello.chpl b/samples/Chapel/hello.chpl new file mode 100644 index 00000000..cc98e786 --- /dev/null +++ b/samples/Chapel/hello.chpl @@ -0,0 +1 @@ +writeln("Hello, world!"); // print 'Hello, world!'
to the console diff --git a/samples/Chapel/lulesh.chpl b/samples/Chapel/lulesh.chpl new file mode 100644 index 00000000..d6480d26 --- /dev/null +++ b/samples/Chapel/lulesh.chpl @@ -0,0 +1,1692 @@ +/* + Derived from the DARPA/Livermore Unstructured Lagrangian Explicit + Shock Hydrodynamics (LULESH) + https://computation.llnl.gov/casc/ShockHydro/ + + Original port to Chapel by Brandon Holt (8/2011). Further + improvements for the sake of performance and/or generality made by + Sung-Eun Choi (12/2011, 11/2012), Jeff Keasler (3/2012), and Brad + Chamberlain (3-4,9-11/2012, 2/2013). + + + Notes on the Initial Implementation + ----------------------------------- + + This implementation was designed to mirror the overall structure of + the C++ Lulesh but use Chapel constructs where they can help make + the code more readable, easier to maintain, or more + 'elegant'. Function names are preserved for the most part, with some + additional helper functions, and original comments from the C++ code + are interspersed approximately where they belong to give an idea of + how the two codes line up. One major difference for this Chapel + version is the use of a number of module-level variables and + constants. + + + Status: + + This code remains a work-in-progress as we gain further experience + with it. Proposed improvements are noted in the README in this + directory and (in some cases) in TODO comments in the code. + + */ + + + +use Time, // to get timing routines for benchmarking + BlockDist; // for block-distributed arrays + +use luleshInit; // initialization code for data set + +/* The configuration parameters for lulesh. These can be set on the + compiler command line using -s<paramName>=<value>. For example, + + chpl -suseBlockDist=true + + useBlockDist : says whether or not to block-distribute the arrays. + The default is based on the value of CHPL_COMM, as + an indication of whether this is a single- or multi- + locale execution. + + use3DRepresentation : indicates whether the element node arrays + should be stored using a 3D representation + (limiting the execution to cube inputs) or + the more general 1D representation (supporting + arbitrary data sets). + + useSparseMaterials : indicates whether sparse domains/arrays should be + used to represent the materials. Sparse domains + are more realistic in that they permit an arbitrary + subset of the problem space to store a material. + Dense domains are sufficient for LULESH since there's + an assumption that the material spans all cells. + + printWarnings : prints performance-oriented warnings to prevent + surprises.
+*/ + +config param useBlockDist = (CHPL_COMM != "none"), + use3DRepresentation = false, + useSparseMaterials = true, + printWarnings = true; + + +// +// Sanity check to ensure that input files aren't used with the 3D +// representation +// +if (use3DRepresentation && (luleshInit.filename != "")) then + halt("The 3D representation does not support reading input from files"); + + +/* Configuration constants: Override defaults on executable's command-line */ + +config const initialEnergy = 3.948746e+7; // initial energy value + + +config const showProgress = false, // print time and dt values on each step + debug = false, // print various debug info + doTiming = true, // time the main timestep loop + printCoords = true; // print the final computed coordinates + + +/* Compile-time constants */ + +param XI_M = 0x003, + XI_M_SYMM = 0x001, + XI_M_FREE = 0x002, + + XI_P = 0x00c, + XI_P_SYMM = 0x004, + XI_P_FREE = 0x008, + + ETA_M = 0x030, + ETA_M_SYMM = 0x010, + ETA_M_FREE = 0x020, + + ETA_P = 0x0c0, + ETA_P_SYMM = 0x040, + ETA_P_FREE = 0x080, + + ZETA_M = 0x300, + ZETA_M_SYMM = 0x100, + ZETA_M_FREE = 0x200, + + ZETA_P = 0xc00, + ZETA_P_SYMM = 0x400, + ZETA_P_FREE = 0x800; + + +/* Set up the problem size */ + +const (numElems, numNodes) = initProblemSize(); + + +/* Declare abstract problem domains */ + +const ElemSpace = if use3DRepresentation + then {0..#elemsPerEdge, 0..#elemsPerEdge, 0..#elemsPerEdge} + else {0..#numElems}, + NodeSpace = if use3DRepresentation + then {0..#nodesPerEdge, 0..#nodesPerEdge, 0..#nodesPerEdge} + else {0..#numNodes}; + + +/* Declare the (potentially distributed) problem domains */ + +const Elems = if useBlockDist then ElemSpace dmapped Block(ElemSpace) + else ElemSpace, + Nodes = if useBlockDist then NodeSpace dmapped Block(NodeSpace) + else NodeSpace; + + +/* The coordinates */ + +var x, y, z: [Nodes] real; + + +/* The number of nodes per element. 
In a rank-independent version, + this could be written 2**rank */ + +param nodesPerElem = 8; + + +// We could name this, but chose not to since it doesn't add that much clarity +// +// const elemNeighbors = 1..nodesPerElem; + + +/* The element-to-node mapping */ + +var elemToNode: [Elems] nodesPerElem*index(Nodes); + + +/* the Greek variables */ + +var lxim, lxip, letam, letap, lzetam, lzetap: [Elems] index(Elems); + + +/* the X, Y, Z Symmetry values */ + +var XSym, YSym, ZSym: sparse subdomain(Nodes); + + + +/* Constants */ + +const u_cut = 1.0e-7, /* velocity tolerance */ + hgcoef = 3.0, /* hourglass control */ + qstop = 1.0e+12, /* excessive q indicator */ + monoq_max_slope = 1.0, + monoq_limiter_mult = 2.0, + e_cut = 1.0e-7, /* energy tolerance */ + p_cut = 1.0e-7, /* pressure tolerance */ + ss4o3 = 4.0/3.0, + q_cut = 1.0e-7, /* q tolerance */ + v_cut = 1.0e-10, /* relative volume tolerance */ + qlc_monoq = 0.5, /* linear term coef for q */ + qqc_monoq = 2.0/3.0, /* quadratic term coef for q */ + qqc = 2.0, + qqc2 = 64.0 * qqc**2, + eosvmax = 1.0e+9, + eosvmin = 1.0e-9, + pmin = 0.0, /* pressure floor */ + emin = -1.0e+15, /* energy floor */ + dvovmax = 0.1, /* maximum allowable volume change */ + refdens = 1.0, /* reference density */ + + deltatimemultlb = 1.1, + deltatimemultub = 1.2, + dtmax = 1.0e-2; /* maximum allowable time increment */ + + +config const stoptime = 1.0e-2, /* end time for simulation */ + maxcycles = max(int), /* max number of cycles to simulate */ + dtfixed = -1.0e-7; /* fixed time increment */ + + +/* The list of material elements */ + +const MatElems: MatElemsType = if useSparseMaterials then enumerateMatElems() + else Elems; + + +proc MatElemsType type { + if useSparseMaterials { + if (printWarnings && useBlockDist && numLocales > 1) then + writeln("WARNING: The LULESH Material Elements (MatElems) are not yet\n", + " distributed, so result in excessive memory use on,\n", + " and communication with, locale 0\n"); + return sparse subdomain(Elems); + } else + return Elems.type; +} + +iter enumerateMatElems() { + if (printWarnings && useBlockDist && numLocales > 1) then + writeln("WARNING: generation of matrix elements is serial and\n", + " unlikely to scale"); + for i in Elems do + yield i; +} + + +/* Element fields */ + +var elemBC: [Elems] int, + + e: [Elems] real, // energy + p: [Elems] real, // pressure + + q: [Elems] real, // q + ql: [Elems] real, // linear term for q + qq: [Elems] real, // quadratic term for q + + v: [Elems] real = 1.0, //relative volume + vnew: [Elems] real, + + volo: [Elems] real, // reference volume + delv: [Elems] real, // m_vnew - m_v + vdov: [Elems] real, // volume derivative over volume + + arealg: [Elems] real, // elem characteristic length + + ss: [Elems] real, // "sound speed" + + elemMass: [Elems] real; // mass + + +/* Nodal fields */ + +var xd, yd, zd: [Nodes] real, // velocities + + xdd, ydd, zdd: [Nodes] real, // acceleration + + fx, fy, fz: [Nodes] atomic real, // forces + + nodalMass: [Nodes] real; // mass + + +/* Parameters */ + +var time = 0.0, // current time + deltatime = 1.0e-7, // variable time increment + dtcourant = 1.0e20, // courant constraint + dthydro = 1.0e20, // volume change constraint + + cycle = 0; // iteration count for simulation + + +proc main() { + if debug then writeln("Lulesh -- Problem Size = ", numElems); + + initLulesh(); + + var st: real; + if doTiming then st = getCurrentTime(); + while (time < stoptime && cycle < maxcycles) { + const iterTime = if showProgress then getCurrentTime() else 0.0; + + 
TimeIncrement(); + + LagrangeLeapFrog(); + + if debug { + // deprint("[[ Forces ]]", fx, fy, fz); + deprint("[[ Positions ]]", x, y, z); + deprint("[[ p, e, q ]]", p, e, q); + } + if showProgress then + writeln("time = ", format("%e", time), ", dt=", format("%e", deltatime), + if doTiming then ", elapsed = " + (getCurrentTime()-iterTime) + else ""); + } + if (cycle == maxcycles) { + writeln("Stopped early due to reaching maxnumsteps"); + } + if doTiming { + const et = getCurrentTime(); + writeln("Total Time: ", et-st); + writeln("Time/Cycle: ", (et-st)/cycle); + } + writeln("Number of cycles: ", cycle); + + if printCoords { + var outfile = open("coords.out", iomode.cw); + var writer = outfile.writer(); + var fmtstr = if debug then "%1.9e" else "%1.4e"; + for i in Nodes { + writer.writeln(format(fmtstr, x[i]), " ", + format(fmtstr, y[i]), " ", + format(fmtstr, z[i])); + } + writer.close(); + outfile.close(); + } +} + + +/* Initialization functions */ + +proc initLulesh() { + // initialize the coordinates + initCoordinates(x,y,z); + + // initialize the element to node mapping + initElemToNodeMapping(elemToNode); + + // initialize the greek symbols + initGreekVars(lxim, lxip, letam, letap, lzetam, lzetap); + + // initialize the symmetry plane locations + initXSyms(XSym); + initYSyms(YSym); + initZSyms(ZSym); + + /* embed hexehedral elements in nodal point lattice */ + //calculated on the fly using: elemToNodes(i: index(Elems)): index(Nodes) + + // initialize the masses + initMasses(); + + // initialize the boundary conditions + const octantCorner = initBoundaryConditions(); + + // deposit the energy for Sedov Problem + e[octantCorner] = initialEnergy; +} + + +proc initMasses() { + // This is a temporary array used to accumulate masses in parallel + // without losing updates by using 'atomic' variables + var massAccum: [Nodes] atomic real; + + forall eli in Elems { + var x_local, y_local, z_local: 8*real; + localizeNeighborNodes(eli, x, x_local, y, y_local, z, z_local); + + const volume = CalcElemVolume(x_local, y_local, z_local); + volo[eli] = volume; + elemMass[eli] = volume; + + for neighbor in elemToNodes[eli] do + massAccum[neighbor].add(volume); + } + + // When we're done, copy the accumulated masses into nodalMass, at + // which point the massAccum array can go away (and will at the + // procedure's return + + forall i in Nodes do + nodalMass[i] = massAccum[i].read() / 8.0; + + if debug { + writeln("ElemMass:"); + for mass in elemMass do writeln(mass); + + writeln("NodalMass:"); + for mass in nodalMass do writeln(mass); + } +} + + +proc initBoundaryConditions() { + var surfaceNode: [Nodes] int; + + forall n in XSym do + surfaceNode[n] = 1; + forall n in YSym do + surfaceNode[n] = 1; + forall n in ZSym do + surfaceNode[n] = 1; + + forall e in Elems do { + var mask: int; + for i in 1..nodesPerElem do + mask += surfaceNode[elemToNode[e][i]] << (i-1); + + // TODO: make an inlined function for this little idiom? (and below) + + if ((mask & 0x0f) == 0x0f) then elemBC[e] |= ZETA_M_SYMM; + if ((mask & 0xf0) == 0xf0) then elemBC[e] |= ZETA_P_SYMM; + if ((mask & 0x33) == 0x33) then elemBC[e] |= ETA_M_SYMM; + if ((mask & 0xcc) == 0xcc) then elemBC[e] |= ETA_P_SYMM; + if ((mask & 0x99) == 0x99) then elemBC[e] |= XI_M_SYMM; + if ((mask & 0x66) == 0x66) then elemBC[e] |= XI_P_SYMM; + } + + + // + // We find the octant corner by looking for the element with + // all three SYMM flags set, which will have the largest + // integral value. Thus, we can use a maxloc to identify it. 
+ // + const (check, loc) = maxloc reduce zip(elemBC, Elems); + + if debug then writeln("Found the octant corner at: ", loc); + + if (check != (XI_M_SYMM | ETA_M_SYMM | ZETA_M_SYMM)) then + halt("maxloc got a value of ", check, " at loc ", loc); + + // TODO: This is an example of an array that, in a distributed + // memory code, would typically be completely local and only storing + // the local nodes owned by the locale -- noting that some nodes + // are logically owned by multiple locales and therefore would + // redundantly be stored in both locales' surfaceNode arrays -- it's + // essentially local scratchspace that does not need to be communicated + // or kept coherent across locales. + // + + surfaceNode = 0; + + /* the free surfaces */ + + var freeSurface: sparse subdomain(Nodes); + + // initialize the free surface + initFreeSurface(freeSurface); + + forall n in freeSurface do + surfaceNode[n] = 1; + + forall e in Elems do { + var mask: int; + for i in 1..nodesPerElem do + mask += surfaceNode[elemToNode[e][i]] << (i-1); + + if ((mask & 0x0f) == 0x0f) then elemBC[e] |= ZETA_M_FREE; + if ((mask & 0xf0) == 0xf0) then elemBC[e] |= ZETA_P_FREE; + if ((mask & 0x33) == 0x33) then elemBC[e] |= ETA_M_FREE; + if ((mask & 0xcc) == 0xcc) then elemBC[e] |= ETA_P_FREE; + if ((mask & 0x99) == 0x99) then elemBC[e] |= XI_M_FREE; + if ((mask & 0x66) == 0x66) then elemBC[e] |= XI_P_FREE; + } + + if debug { + writeln("elemBC:"); + for b in elemBC do writeln(b); + } + + return loc; +} + + +/* Helper functions */ + +inline proc localizeNeighborNodes(eli: index(Elems), + x: [] real, ref x_local: 8*real, + y: [] real, ref y_local: 8*real, + z: [] real, ref z_local: 8*real) { + + for param i in 1..nodesPerElem { + const noi = elemToNode[eli][i]; + + x_local[i] = x[noi]; + y_local[i] = y[noi]; + z_local[i] = z[noi]; + } +} + +inline proc TripleProduct(x1, y1, z1, x2, y2, z2, x3, y3, z3) { + return x1*(y2*z3 - z2*y3) + x2*(z1*y3 - y1*z3) + x3*(y1*z2 - z1*y2); +} + + +proc CalcElemVolume(x, y, z) { + const dx61 = x[7] - x[2], + dy61 = y[7] - y[2], + dz61 = z[7] - z[2], + + dx70 = x[8] - x[1], + dy70 = y[8] - y[1], + dz70 = z[8] - z[1], + + dx63 = x[7] - x[4], + dy63 = y[7] - y[4], + dz63 = z[7] - z[4], + + dx20 = x[3] - x[1], + dy20 = y[3] - y[1], + dz20 = z[3] - z[1], + + dx50 = x[6] - x[1], + dy50 = y[6] - y[1], + dz50 = z[6] - z[1], + + dx64 = x[7] - x[5], + dy64 = y[7] - y[5], + dz64 = z[7] - z[5], + + dx31 = x[4] - x[2], + dy31 = y[4] - y[2], + dz31 = z[4] - z[2], + + dx72 = x[8] - x[3], + dy72 = y[8] - y[3], + dz72 = z[8] - z[3], + + dx43 = x[5] - x[4], + dy43 = y[5] - y[4], + dz43 = z[5] - z[4], + + dx57 = x[6] - x[8], + dy57 = y[6] - y[8], + dz57 = z[6] - z[8], + + dx14 = x[2] - x[5], + dy14 = y[2] - y[5], + dz14 = z[2] - z[5], + + dx25 = x[3] - x[6], + dy25 = y[3] - y[6], + dz25 = z[3] - z[6]; + + const volume = TripleProduct(dx31 + dx72, dx63, dx20, + dy31 + dy72, dy63, dy20, + dz31 + dz72, dz63, dz20) + + TripleProduct(dx43 + dx57, dx64, dx70, + dy43 + dy57, dy64, dy70, + dz43 + dz57, dz64, dz70) + + TripleProduct(dx14 + dx25, dx61, dx50, + dy14 + dy25, dy61, dy50, + dz14 + dz25, dz61, dz50); + + return volume / 12.0; +} + +proc InitStressTermsForElems(p, q, sigxx, sigyy, sigzz: [?D] real) { + forall i in D { + sigxx[i] = -p[i] - q[i]; + sigyy[i] = -p[i] - q[i]; + sigzz[i] = -p[i] - q[i]; + } +} + + +proc CalcElemShapeFunctionDerivatives(x: 8*real, y: 8*real, z: 8*real, + ref b_x: 8*real, + ref b_y: 8*real, + ref b_z: 8*real, + ref volume: real) { + + const fjxxi = .125 * ((x[7]-x[1]) + (x[6]-x[4]) 
- (x[8]-x[2]) - (x[5]-x[3])), + fjxet = .125 * ((x[7]-x[1]) - (x[6]-x[4]) + (x[8]-x[2]) - (x[5]-x[3])), + fjxze = .125 * ((x[7]-x[1]) + (x[6]-x[4]) + (x[8]-x[2]) + (x[5]-x[3])), + + fjyxi = .125 * ((y[7]-y[1]) + (y[6]-y[4]) - (y[8]-y[2]) - (y[5]-y[3])), + fjyet = .125 * ((y[7]-y[1]) - (y[6]-y[4]) + (y[8]-y[2]) - (y[5]-y[3])), + fjyze = .125 * ((y[7]-y[1]) + (y[6]-y[4]) + (y[8]-y[2]) + (y[5]-y[3])), + + fjzxi = .125 * ((z[7]-z[1]) + (z[6]-z[4]) - (z[8]-z[2]) - (z[5]-z[3])), + fjzet = .125 * ((z[7]-z[1]) - (z[6]-z[4]) + (z[8]-z[2]) - (z[5]-z[3])), + fjzze = .125 * ((z[7]-z[1]) + (z[6]-z[4]) + (z[8]-z[2]) + (z[5]-z[3])); + + /* compute cofactors */ + const cjxxi = (fjyet * fjzze) - (fjzet * fjyze), + cjxet = - (fjyxi * fjzze) + (fjzxi * fjyze), + cjxze = (fjyxi * fjzet) - (fjzxi * fjyet), + + cjyxi = - (fjxet * fjzze) + (fjzet * fjxze), + cjyet = (fjxxi * fjzze) - (fjzxi * fjxze), + cjyze = - (fjxxi * fjzet) + (fjzxi * fjxet), + + cjzxi = (fjxet * fjyze) - (fjyet * fjxze), + cjzet = - (fjxxi * fjyze) + (fjyxi * fjxze), + cjzze = (fjxxi * fjyet) - (fjyxi * fjxet); + + /* calculate partials : + this need only be done for l = 0,1,2,3 since , by symmetry , + (6,7,4,5) = - (0,1,2,3) . + */ + b_x[1] = - cjxxi - cjxet - cjxze; + b_x[2] = cjxxi - cjxet - cjxze; + b_x[3] = cjxxi + cjxet - cjxze; + b_x[4] = - cjxxi + cjxet - cjxze; + b_x[5] = -b_x[3]; + b_x[6] = -b_x[4]; + b_x[7] = -b_x[1]; + b_x[8] = -b_x[2]; + + b_y[1] = - cjyxi - cjyet - cjyze; + b_y[2] = cjyxi - cjyet - cjyze; + b_y[3] = cjyxi + cjyet - cjyze; + b_y[4] = - cjyxi + cjyet - cjyze; + b_y[5] = -b_y[3]; + b_y[6] = -b_y[4]; + b_y[7] = -b_y[1]; + b_y[8] = -b_y[2]; + + b_z[1] = - cjzxi - cjzet - cjzze; + b_z[2] = cjzxi - cjzet - cjzze; + b_z[3] = cjzxi + cjzet - cjzze; + b_z[4] = - cjzxi + cjzet - cjzze; + b_z[5] = -b_z[3]; + b_z[6] = -b_z[4]; + b_z[7] = -b_z[1]; + b_z[8] = -b_z[2]; + + /* calculate jacobian determinant (volume) */ + volume = 8.0 * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet); +} + + +proc CalcElemNodeNormals(ref pfx: 8*real, ref pfy: 8*real, ref pfz: 8*real, + x: 8*real, y: 8*real, z: 8*real) { + + proc ElemFaceNormal(param n1, param n2, param n3, param n4) { + const bisectX0 = 0.5 * (x[n4] + x[n3] - x[n2] - x[n1]), + bisectY0 = 0.5 * (y[n4] + y[n3] - y[n2] - y[n1]), + bisectZ0 = 0.5 * (z[n4] + z[n3] - z[n2] - z[n1]), + bisectX1 = 0.5 * (x[n3] + x[n2] - x[n4] - x[n1]), + bisectY1 = 0.5 * (y[n3] + y[n2] - y[n4] - y[n1]), + bisectZ1 = 0.5 * (z[n3] + z[n2] - z[n4] - z[n1]), + areaX = 0.25 * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1), + areaY = 0.25 * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1), + areaZ = 0.25 * (bisectX0 * bisectY1 - bisectY0 * bisectX1); + + var rx, ry, rz: 8*real; //results + + (rx[n1], rx[n2], rx[n3], rx[n4]) = (areaX, areaX, areaX, areaX); + (ry[n1], ry[n2], ry[n3], ry[n4]) = (areaY, areaY, areaY, areaY); + (rz[n1], rz[n2], rz[n3], rz[n4]) = (areaZ, areaZ, areaZ, areaZ); + + return (rx, ry, rz); + } + + // calculate total normal from each face (faces are made up of + // combinations of nodes) + + (pfx, pfy, pfz) = ElemFaceNormal(1,2,3,4) + ElemFaceNormal(1,5,6,2) + + ElemFaceNormal(2,6,7,3) + ElemFaceNormal(3,7,8,4) + + ElemFaceNormal(4,8,5,1) + ElemFaceNormal(5,8,7,6); +} + + +proc SumElemStressesToNodeForces(b_x: 8*real, b_y: 8*real, b_z: 8*real, + stress_xx:real, + stress_yy:real, + stress_zz: real, + ref fx: 8*real, + ref fy: 8*real, + ref fz: 8*real) { + for param i in 1..8 { + fx[i] = -(stress_xx * b_x[i]); + fy[i] = -(stress_yy * b_y[i]); + fz[i] = -(stress_zz * b_z[i]); + } +} + +proc 
CalcElemVolumeDerivative(x: 8*real, y: 8*real, z: 8*real) { + + proc VoluDer(param n0, param n1, param n2, param n3, param n4, param n5) { + const ox = (y[n1] + y[n2]) * (z[n0] + z[n1]) + - (y[n0] + y[n1]) * (z[n1] + z[n2]) + + (y[n0] + y[n4]) * (z[n3] + z[n4]) + - (y[n3] + y[n4]) * (z[n0] + z[n4]) + - (y[n2] + y[n5]) * (z[n3] + z[n5]) + + (y[n3] + y[n5]) * (z[n2] + z[n5]), + oy = - (x[n1] + x[n2]) * (z[n0] + z[n1]) + + (x[n0] + x[n1]) * (z[n1] + z[n2]) + - (x[n0] + x[n4]) * (z[n3] + z[n4]) + + (x[n3] + x[n4]) * (z[n0] + z[n4]) + + (x[n2] + x[n5]) * (z[n3] + z[n5]) + - (x[n3] + x[n5]) * (z[n2] + z[n5]), + oz = - (y[n1] + y[n2]) * (x[n0] + x[n1]) + + (y[n0] + y[n1]) * (x[n1] + x[n2]) + - (y[n0] + y[n4]) * (x[n3] + x[n4]) + + (y[n3] + y[n4]) * (x[n0] + x[n4]) + + (y[n2] + y[n5]) * (x[n3] + x[n5]) + - (y[n3] + y[n5]) * (x[n2] + x[n5]); + + return (ox/12.0, oy/12.0, oz/12.0); + } + + var dvdx, dvdy, dvdz: 8*real; + + (dvdx[1], dvdy[1], dvdz[1]) = VoluDer(2,3,4,5,6,8); + (dvdx[4], dvdy[4], dvdz[4]) = VoluDer(1,2,3,8,5,7); + (dvdx[3], dvdy[3], dvdz[3]) = VoluDer(4,1,2,7,8,6); + (dvdx[2], dvdy[2], dvdz[2]) = VoluDer(3,4,1,6,7,5); + (dvdx[5], dvdy[5], dvdz[5]) = VoluDer(8,7,6,1,4,2); + (dvdx[6], dvdy[6], dvdz[6]) = VoluDer(5,8,7,2,1,3); + (dvdx[7], dvdy[7], dvdz[7]) = VoluDer(6,5,8,3,2,4); + (dvdx[8], dvdy[8], dvdz[8]) = VoluDer(7,6,5,4,3,1); + + return (dvdx, dvdy, dvdz); +} + +inline proc CalcElemFBHourglassForce(xd: 8*real, yd: 8*real, zd: 8*real, + hourgam: 8*(4*real), + coefficient: real, + ref hgfx: 8*real, + ref hgfy: 8*real, + ref hgfz: 8*real) { + var hx, hy, hz: 4*real; + + // reduction + for param i in 1..4 { + for param j in 1..8 { + hx[i] += hourgam[j][i] * xd[j]; + hy[i] += hourgam[j][i] * yd[j]; + hz[i] += hourgam[j][i] * zd[j]; + } + } + + for param i in 1..8 { + var shx, shy, shz: real; + for param j in 1..4 { + shx += hourgam[i][j] * hx[j]; + shy += hourgam[i][j] * hy[j]; + shz += hourgam[i][j] * hz[j]; + } + hgfx[i] = coefficient * shx; + hgfy[i] = coefficient * shy; + hgfz[i] = coefficient * shz; + } +} + + +proc CalcElemCharacteristicLength(x, y, z, volume) { + proc AreaFace(param p0, param p1, param p2, param p3) { + const fx = (x[p2] - x[p0]) - (x[p3] - x[p1]), + fy = (y[p2] - y[p0]) - (y[p3] - y[p1]), + fz = (z[p2] - z[p0]) - (z[p3] - z[p1]), + gx = (x[p2] - x[p0]) + (x[p3] - x[p1]), + gy = (y[p2] - y[p0]) + (y[p3] - y[p1]), + gz = (z[p2] - z[p0]) + (z[p3] - z[p1]), + area = (fx * fx + fy * fy + fz * fz) * + (gx * gx + gy * gy + gz * gz) - + (fx * gx + fy * gy + fz * gz) * + (fx * gx + fy * gy + fz * gz); + + return area ; + } + + const charLength = max(AreaFace(1, 2, 3, 4), + AreaFace(5, 6, 7, 8), + AreaFace(1, 2, 6, 5), + AreaFace(2, 3, 7, 6), + AreaFace(3, 4, 8, 7), + AreaFace(4, 1, 5, 8)); + + return 4.0 * volume / sqrt(charLength); +} + + +proc CalcElemVelocityGradient(xvel, yvel, zvel, pfx, pfy, pfz, + detJ, ref d: 6*real) { + const inv_detJ = 1.0 / detJ; + + d[1] = inv_detJ * ( pfx[1] * (xvel[1]-xvel[7]) + + pfx[2] * (xvel[2]-xvel[8]) + + pfx[3] * (xvel[3]-xvel[5]) + + pfx[4] * (xvel[4]-xvel[6]) ); + d[2] = inv_detJ * ( pfy[1] * (yvel[1]-yvel[7]) + + pfy[2] * (yvel[2]-yvel[8]) + + pfy[3] * (yvel[3]-yvel[5]) + + pfy[4] * (yvel[4]-yvel[6]) ); + d[3] = inv_detJ * ( pfz[1] * (zvel[1]-zvel[7]) + + pfz[2] * (zvel[2]-zvel[8]) + + pfz[3] * (zvel[3]-zvel[5]) + + pfz[4] * (zvel[4]-zvel[6]) ); + + const dyddx = inv_detJ * ( pfx[1] * (yvel[1]-yvel[7]) + + pfx[2] * (yvel[2]-yvel[8]) + + pfx[3] * (yvel[3]-yvel[5]) + + pfx[4] * (yvel[4]-yvel[6]) ), + + dxddy = inv_detJ * ( pfy[1] * 
(xvel[1]-xvel[7]) + + pfy[2] * (xvel[2]-xvel[8]) + + pfy[3] * (xvel[3]-xvel[5]) + + pfy[4] * (xvel[4]-xvel[6]) ), + + dzddx = inv_detJ * ( pfx[1] * (zvel[1]-zvel[7]) + + pfx[2] * (zvel[2]-zvel[8]) + + pfx[3] * (zvel[3]-zvel[5]) + + pfx[4] * (zvel[4]-zvel[6]) ), + + dxddz = inv_detJ * ( pfz[1] * (xvel[1]-xvel[7]) + + pfz[2] * (xvel[2]-xvel[8]) + + pfz[3] * (xvel[3]-xvel[5]) + + pfz[4] * (xvel[4]-xvel[6]) ), + + dzddy = inv_detJ * ( pfy[1] * (zvel[1]-zvel[7]) + + pfy[2] * (zvel[2]-zvel[8]) + + pfy[3] * (zvel[3]-zvel[5]) + + pfy[4] * (zvel[4]-zvel[6]) ), + + dyddz = inv_detJ * ( pfz[1] * (yvel[1]-yvel[7]) + + pfz[2] * (yvel[2]-yvel[8]) + + pfz[3] * (yvel[3]-yvel[5]) + + pfz[4] * (yvel[4]-yvel[6]) ); + + d[6] = 0.5 * ( dxddy + dyddx ); + d[5] = 0.5 * ( dxddz + dzddx ); + d[4] = 0.5 * ( dzddy + dyddz ); +} + + +proc CalcPressureForElems(p_new: [?D] real, bvc, pbvc, + e_old, compression, vnewc, + pmin: real, p_cut: real, eosvmax: real) { + + // + // TODO: Uncomment local once sparse domain is distributed + // + forall i in D /* do local */ { + const c1s = 2.0 / 3.0; + bvc[i] = c1s * (compression[i] + 1.0); + pbvc[i] = c1s; + } + + forall i in D { + p_new[i] = bvc[i] * e_old[i]; + + if abs(p_new[i]) < p_cut then p_new[i] = 0.0; + if vnewc[i] >= eosvmax then p_new[i] = 0.0; //impossible? + if p_new[i] < pmin then p_new[i] = pmin; + } +} + + +proc TimeIncrement() { + var targetdt = stoptime - time; + + if dtfixed <= 0.0 && cycle != 0 { //don't do this the first cycle + var olddt = deltatime, + newdt = 1.0e20; + + if dtcourant < newdt then newdt = dtcourant / 2.0; + if dthydro < newdt then newdt = 2.0/3.0 * dthydro; + + const ratio = newdt / olddt; + if ratio >= 1.0 { + if ratio < deltatimemultlb then newdt = olddt; + else if ratio > deltatimemultub then newdt = olddt * deltatimemultub; + } + if newdt > dtmax then newdt = dtmax; + + deltatime = newdt; + } + + /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */ + if targetdt > deltatime && targetdt < (4.0/3.0 * deltatime) { + targetdt = 2.0/3.0 * deltatime; + } + if targetdt < deltatime then deltatime = targetdt; + + time += deltatime; + cycle += 1; +} + +inline proc LagrangeLeapFrog() { + /* calculate nodal forces, accelerations, velocities, positions, with + * applied boundary conditions and slide surface considerations */ + LagrangeNodal(); + + /* calculate element quantities (i.e. velocity gradient & q), and update + * material states */ + LagrangeElements(); + + CalcTimeConstraintsForElems(); +} + + +inline proc LagrangeNodal() { + CalcForceForNodes(); + + CalcAccelerationForNodes(); + + ApplyAccelerationBoundaryConditionsForNodes(); + + CalcVelocityForNodes(deltatime, u_cut); + + CalcPositionForNodes(deltatime); +} + + +inline proc LagrangeElements() { + CalcLagrangeElements(); + + /* Calculate Q. 
(Monotonic q option requires communication) */ + CalcQForElems(); + + ApplyMaterialPropertiesForElems(); + + UpdateVolumesForElems(); +} + + +inline proc CalcTimeConstraintsForElems() { + /* evaluate time constraint */ + CalcCourantConstraintForElems(); + + /* check hydro constraint */ + CalcHydroConstraintForElems(); +} + + +inline proc computeDTF(indx) { + const myvdov = vdov[indx]; + + if myvdov == 0.0 then + return max(real); + + const myarealg = arealg[indx]; + var dtf = ss[indx]**2; + if myvdov < 0.0 then + dtf += qqc2 * myarealg**2 * myvdov**2; + dtf = sqrt(dtf); + dtf = myarealg / dtf; + + return dtf; +} + + +proc CalcCourantConstraintForElems() { + const val = min reduce [indx in MatElems] computeDTF(indx); + + if (val != max(real)) then + dtcourant = val; +} + + +inline proc calcDtHydroTmp(indx) { + const myvdov = vdov[indx]; + if (myvdov == 0.0) then + return max(real); + else + return dvovmax / (abs(myvdov)+1.0e-20); +} + + +proc CalcHydroConstraintForElems() { + const val = min reduce [indx in MatElems] calcDtHydroTmp(indx); + + if (val != max(real)) then + dthydro = val; +} + + +/* calculate nodal forces, accelerations, velocities, positions, with + * applied boundary conditions and slide surface considerations */ + +proc CalcForceForNodes() { + //zero out all forces + forall x in fx do x.write(0); + forall y in fy do y.write(0); + forall z in fz do z.write(0); + + /* Calcforce calls partial, force, hourq */ + CalcVolumeForceForElems(); + + /* Calculate Nodal Forces at domain boundaries */ + // this was commented out in C++ code, so left out here +} + +proc CalcVolumeForceForElems() { + var sigxx, sigyy, sigzz, determ: [Elems] real; + + /* Sum contributions to total stress tensor */ + InitStressTermsForElems(p, q, sigxx, sigyy, sigzz); + + /* call elemlib stress integration loop to produce nodal forces from + material stresses. */ + IntegrateStressForElems(sigxx, sigyy, sigzz, determ); + + /* check for negative element volume */ + forall e in Elems { + if determ[e] <= 0.0 then + halt("can't have negative volume (determ[", e, "]=", determ[e], ")"); + } + + CalcHourglassControlForElems(determ); +} + + +proc IntegrateStressForElems(sigxx, sigyy, sigzz, determ) { + forall k in Elems { + var b_x, b_y, b_z: 8*real; + var x_local, y_local, z_local: 8*real; + localizeNeighborNodes(k, x, x_local, y, y_local, z, z_local); + + var fx_local, fy_local, fz_local: 8*real; + + local { + /* Volume calculation involves extra work for numerical consistency. 
*/ + CalcElemShapeFunctionDerivatives(x_local, y_local, z_local, + b_x, b_y, b_z, determ[k]); + + CalcElemNodeNormals(b_x, b_y, b_z, x_local, y_local, z_local); + + SumElemStressesToNodeForces(b_x, b_y, b_z, sigxx[k], sigyy[k], sigzz[k], + fx_local, fy_local, fz_local); + } + + for (noi, t) in elemToNodesTuple(k) { + fx[noi].add(fx_local[t]); + fy[noi].add(fy_local[t]); + fz[noi].add(fz_local[t]); + } + } +} + + +proc CalcHourglassControlForElems(determ) { + var dvdx, dvdy, dvdz, x8n, y8n, z8n: [Elems] 8*real; + + forall eli in Elems { + /* Collect domain nodes to elem nodes */ + var x1, y1, z1: 8*real; + localizeNeighborNodes(eli, x, x1, y, y1, z, z1); + var pfx, pfy, pfz: 8*real; + + local { + /* load into temporary storage for FB Hour Glass control */ + (dvdx[eli], dvdy[eli], dvdz[eli]) = CalcElemVolumeDerivative(x1, y1, z1); + } + + x8n[eli] = x1; + y8n[eli] = y1; + z8n[eli] = z1; + + determ[eli] = volo[eli] * v[eli]; + } + + /* Do a check for negative volumes */ + forall e in Elems { + if v[e] <= 0.0 then + halt("can't have negative (or zero) volume. (in Hourglass, v[", e, "]=", v[e], ")"); + } + + if hgcoef > 0.0 { + CalcFBHourglassForceForElems(determ, x8n, y8n, z8n, dvdx, dvdy, dvdz); + } +} + + +const gammaCoef: 4*(8*real) = // WAS: [1..4, 1..8] real = + (( 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0), + ( 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0), + ( 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0), + (-1.0, 1.0, -1.0, 1.0, 1.0, -1.0, 1.0, -1.0)); + +/* Calculates the Flanagan-Belytschko anti-hourglass force. */ +proc CalcFBHourglassForceForElems(determ, x8n, y8n, z8n, dvdx, dvdy, dvdz) { + + /* compute the hourglass modes */ + forall eli in Elems { + var hourgam: 8*(4*real); + const volinv = 1.0 / determ[eli]; + var ss1, mass1, volume13: real; + var hgfx, hgfy, hgfz: 8*real; + var coefficient: real; + + var xd1, yd1, zd1: 8*real; + localizeNeighborNodes(eli, xd, xd1, yd, yd1, zd, zd1); + + /* TODO: Can we enable this local block? */ + // local { + for param i in 1..4 { + var hourmodx, hourmody, hourmodz: real; + // reduction + for param j in 1..8 { + hourmodx += x8n[eli][j] * gammaCoef[i][j]; + hourmody += y8n[eli][j] * gammaCoef[i][j]; + hourmodz += z8n[eli][j] * gammaCoef[i][j]; + } + + for param j in 1..8 { + hourgam[j][i] = gammaCoef[i][j] - volinv * + (dvdx[eli][j] * hourmodx + + dvdy[eli][j] * hourmody + + dvdz[eli][j] * hourmodz); + } + } + + /* compute forces */ + /* store forces into h arrays (force arrays) */ + ss1 = ss[eli]; + mass1 = elemMass[eli]; + volume13 = cbrt(determ[eli]); + + coefficient = - hgcoef * 0.01 * ss1 * mass1 / volume13; + + CalcElemFBHourglassForce(xd1, yd1, zd1, hourgam, coefficient, + hgfx, hgfy, hgfz); + // } // end local + + for (noi,i) in elemToNodesTuple(eli) { + fx[noi].add(hgfx[i]); + fy[noi].add(hgfy[i]); + fz[noi].add(hgfz[i]); + } + } +} + + +proc CalcAccelerationForNodes() { + forall noi in Nodes do local { + xdd[noi] = fx[noi].read() / nodalMass[noi]; + ydd[noi] = fy[noi].read() / nodalMass[noi]; + zdd[noi] = fz[noi].read() / nodalMass[noi]; + } +} + + +proc ApplyAccelerationBoundaryConditionsForNodes() { + // TODO: Shouldn't we be able to write these as follows? 
+  //
+  // xdd[XSym] = 0.0;
+  // ydd[YSym] = 0.0;
+  // zdd[ZSym] = 0.0;
+
+  forall x in XSym do xdd[x] = 0.0;
+  forall y in YSym do ydd[y] = 0.0;
+  forall z in ZSym do zdd[z] = 0.0;
+}
+
+
+proc CalcVelocityForNodes(dt: real, u_cut: real) {
+  forall i in Nodes do local {
+    var xdtmp = xd[i] + xdd[i] * dt,
+        ydtmp = yd[i] + ydd[i] * dt,
+        zdtmp = zd[i] + zdd[i] * dt;
+    if abs(xdtmp) < u_cut then xdtmp = 0.0;
+    if abs(ydtmp) < u_cut then ydtmp = 0.0;
+    if abs(zdtmp) < u_cut then zdtmp = 0.0;
+    xd[i] = xdtmp;
+    yd[i] = ydtmp;
+    zd[i] = zdtmp;
+  }
+}
+
+
+proc CalcPositionForNodes(dt: real) {
+  forall ijk in Nodes {
+    x[ijk] += xd[ijk] * dt;
+    y[ijk] += yd[ijk] * dt;
+    z[ijk] += zd[ijk] * dt;
+  }
+}
+
+// sungeun: Temporary array reused throughout
+proc CalcLagrangeElements() {
+  var dxx, dyy, dzz: [Elems] real;
+
+  CalcKinematicsForElems(dxx, dyy, dzz, deltatime);
+
+  // element loop to do some stuff not included in the elemlib function.
+  forall k in Elems do local {
+    vdov[k] = dxx[k] + dyy[k] + dzz[k];
+    var vdovthird = vdov[k] / 3.0;
+    dxx[k] -= vdovthird;
+    dyy[k] -= vdovthird;
+    dzz[k] -= vdovthird;
+  }
+
+  // See if any volumes are negative, and take appropriate action.
+  forall e in Elems {
+    if vnew[e] <= 0.0 then
+      halt("can't have negative volume (vnew[", e, "]=", vnew[e], ")");
+  }
+}
+
+
+proc CalcKinematicsForElems(dxx, dyy, dzz, const dt: real) {
+  // loop over all elements
+  forall k in Elems {
+    var b_x, b_y, b_z: 8*real,
+        d: 6*real,
+        detJ: real;
+
+    //get nodal coordinates from global arrays and copy into local arrays
+    var x_local, y_local, z_local: 8*real;
+    localizeNeighborNodes(k, x, x_local, y, y_local, z, z_local);
+
+    //get nodal velocities from global arrays and copy into local arrays
+    var xd_local, yd_local, zd_local: 8*real;
+    localizeNeighborNodes(k, xd, xd_local, yd, yd_local, zd, zd_local);
+    var dt2 = 0.5 * dt; //wish this was local, too...
+
+    local {
+      //volume calculations
+      const volume = CalcElemVolume(x_local, y_local, z_local);
+      const relativeVolume = volume / volo[k];
+      vnew[k] = relativeVolume;
+      delv[k] = relativeVolume - v[k];
+
+      //set characteristic length
+      arealg[k] = CalcElemCharacteristicLength(x_local, y_local, z_local,
+                                               volume);
+
+      for param i in 1..8 {
+        x_local[i] -= dt2 * xd_local[i];
+        y_local[i] -= dt2 * yd_local[i];
+        z_local[i] -= dt2 * zd_local[i];
+      }
+
+      CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
+                                       b_x, b_y, b_z, detJ);
+
+      CalcElemVelocityGradient(xd_local, yd_local, zd_local, b_x, b_y, b_z,
+                               detJ, d);
+
+    }
+
+    // put velocity gradient quantities into their global arrays.
+    dxx[k] = d[1];
+    dyy[k] = d[2];
+    dzz[k] = d[3];
+  }
+}
+
+
+// sungeun: Temporary array reused throughout
+/* velocity gradient */
+var delv_xi, delv_eta, delv_zeta: [Elems] real;
+/* position gradient */
+var delx_xi, delx_eta, delx_zeta: [Elems] real;
+
+proc CalcQForElems() {
+  // MONOTONIC Q option
+
+  /* Calculate velocity gradients */
+  CalcMonotonicQGradientsForElems(delv_xi, delv_eta, delv_zeta,
+                                  delx_xi, delx_eta, delx_zeta);
+
+  /* Transfer velocity gradients in the first order elements */
+  /* problem->commElements->Transfer(CommElements::monoQ) ; */
+  CalcMonotonicQForElems(delv_xi, delv_eta, delv_zeta,
+                         delx_xi, delx_eta, delx_zeta);
+
+  /* Don't allow excessive artificial viscosity */
+  forall e in Elems {
+    if q[e] > qstop then
+      halt("Excessive artificial viscosity! (q[", e, "]=", q[e], ")");
+  }
+}
+
+
+// sungeun: Temporary array reused throughout
+var vnewc: [MatElems] real;
+
+/* Expose all of the variables needed for material evaluation */
+proc ApplyMaterialPropertiesForElems() {
+
+  forall i in MatElems do vnewc[i] = vnew[i];
+
+  if eosvmin != 0.0 then
+    [c in vnewc] if c < eosvmin then c = eosvmin;
+
+  if eosvmax != 0.0 then
+    [c in vnewc] if c > eosvmax then c = eosvmax;
+
+
+  // old comment: The following loop should compute min/max reductions;
+  // currently, race-y
+
+  forall matelm in MatElems {
+    var vc = v[matelm];
+    if eosvmin != 0.0 && vc < eosvmin then vc = eosvmin;
+    if eosvmax != 0.0 && vc > eosvmax then vc = eosvmax;
+    if vc <= 0.0 {
+      writeln("Volume error (in ApplyMaterialPropertiesForElems).");
+      exit(1);
+    }
+  }
+
+  EvalEOSForElems(vnewc);
+}
+
+
+proc UpdateVolumesForElems() {
+  forall i in Elems do local {
+    var tmpV = vnew[i];
+    if abs(tmpV-1.0) < v_cut then tmpV = 1.0;
+    v[i] = tmpV;
+  }
+}
+
+
+proc CalcMonotonicQGradientsForElems(delv_xi, delv_eta, delv_zeta,
+                                     delx_xi, delx_eta, delx_zeta) {
+  forall eli in Elems {
+    const ptiny = 1.0e-36;
+    var xl, yl, zl: 8*real;
+    localizeNeighborNodes(eli, x, xl, y, yl, z, zl);
+    var xvl, yvl, zvl: 8*real;
+    localizeNeighborNodes(eli, xd, xvl, yd, yvl, zd, zvl);
+
+    local {
+      const vol = volo[eli] * vnew[eli],
+            norm = 1.0 / (vol + ptiny);
+      var ax, ay, az, dxv, dyv, dzv: real;
+
+      const dxj = -0.25*((xl[1]+xl[2]+xl[6]+xl[5])-(xl[4]+xl[3]+xl[7]+xl[8])),
+            dyj = -0.25*((yl[1]+yl[2]+yl[6]+yl[5])-(yl[4]+yl[3]+yl[7]+yl[8])),
+            dzj = -0.25*((zl[1]+zl[2]+zl[6]+zl[5])-(zl[4]+zl[3]+zl[7]+zl[8])),
+
+            dxi = 0.25*((xl[2]+xl[3]+xl[7]+xl[6])-(xl[1]+xl[4]+xl[8]+xl[5])),
+            dyi = 0.25*((yl[2]+yl[3]+yl[7]+yl[6])-(yl[1]+yl[4]+yl[8]+yl[5])),
+            dzi = 0.25*((zl[2]+zl[3]+zl[7]+zl[6])-(zl[1]+zl[4]+zl[8]+zl[5])),
+
+            dxk = 0.25*((xl[5]+xl[6]+xl[7]+xl[8])-(xl[1]+xl[2]+xl[3]+xl[4])),
+            dyk = 0.25*((yl[5]+yl[6]+yl[7]+yl[8])-(yl[1]+yl[2]+yl[3]+yl[4])),
+            dzk = 0.25*((zl[5]+zl[6]+zl[7]+zl[8])-(zl[1]+zl[2]+zl[3]+zl[4]));
+
+      /* find delvk and delxk ( i cross j ) */
+
+      ax = dyi*dzj - dzi*dyj;
+      ay = dzi*dxj - dxi*dzj;
+      az = dxi*dyj - dyi*dxj;
+
+      delx_zeta[eli] = vol / sqrt(ax*ax + ay*ay + az*az + ptiny);
+
+      ax *= norm;
+      ay *= norm;
+      az *= norm;
+
+      dxv = 0.25*((xvl[5]+xvl[6]+xvl[7]+xvl[8])-(xvl[1]+xvl[2]+xvl[3]+xvl[4]));
+      dyv = 0.25*((yvl[5]+yvl[6]+yvl[7]+yvl[8])-(yvl[1]+yvl[2]+yvl[3]+yvl[4]));
+      dzv = 0.25*((zvl[5]+zvl[6]+zvl[7]+zvl[8])-(zvl[1]+zvl[2]+zvl[3]+zvl[4]));
+
+      delv_zeta[eli] = ax*dxv + ay*dyv + az*dzv;
+
+      /* find delxi and delvi ( j cross k ) */
+
+      ax = dyj*dzk - dzj*dyk;
+      ay = dzj*dxk - dxj*dzk;
+      az = dxj*dyk - dyj*dxk;
+
+      delx_xi[eli] = vol / sqrt(ax*ax + ay*ay + az*az + ptiny) ;
+
+      ax *= norm;
+      ay *= norm;
+      az *= norm;
+
+      dxv = 0.25*((xvl[2]+xvl[3]+xvl[7]+xvl[6])-(xvl[1]+xvl[4]+xvl[8]+xvl[5]));
+      dyv = 0.25*((yvl[2]+yvl[3]+yvl[7]+yvl[6])-(yvl[1]+yvl[4]+yvl[8]+yvl[5]));
+      dzv = 0.25*((zvl[2]+zvl[3]+zvl[7]+zvl[6])-(zvl[1]+zvl[4]+zvl[8]+zvl[5]));
+
+      delv_xi[eli] = ax*dxv + ay*dyv + az*dzv ;
+
+      /* find delxj and delvj ( k cross i ) */
+
+      ax = dyk*dzi - dzk*dyi;
+      ay = dzk*dxi - dxk*dzi;
+      az = dxk*dyi - dyk*dxi;
+
+      delx_eta[eli] = vol / sqrt(ax*ax + ay*ay + az*az + ptiny) ;
+
+      ax *= norm;
+      ay *= norm;
+      az *= norm;
+
+      dxv= -0.25*((xvl[1]+xvl[2]+xvl[6]+xvl[5])-(xvl[4]+xvl[3]+xvl[7]+xvl[8]));
+      dyv= -0.25*((yvl[1]+yvl[2]+yvl[6]+yvl[5])-(yvl[4]+yvl[3]+yvl[7]+yvl[8]));
+      dzv= -0.25*((zvl[1]+zvl[2]+zvl[6]+zvl[5])-(zvl[4]+zvl[3]+zvl[7]+zvl[8]));
+
+      delv_eta[eli] = ax*dxv + ay*dyv +
az*dzv ; + } /* local */ + } /* forall eli */ +} + + +proc CalcMonotonicQForElems(delv_xi, delv_eta, delv_zeta, + delx_xi, delx_eta, delx_zeta) { + //got rid of call through to "CalcMonotonicQRegionForElems" + + forall i in MatElems { + const ptiny = 1.0e-36; + const bcMask = elemBC[i]; + var norm, delvm, delvp: real; + + /* phixi */ + norm = 1.0 / (delv_xi[i] + ptiny); + + select bcMask & XI_M { + when 0 do delvm = delv_xi[lxim(i)]; + when XI_M_SYMM do delvm = delv_xi[i]; + when XI_M_FREE do delvm = 0.0; + } + select bcMask & XI_P { + when 0 do delvp = delv_xi[lxip(i)]; + when XI_P_SYMM do delvp = delv_xi[i]; + when XI_P_FREE do delvp = 0.0; + } + + delvm *= norm; + delvp *= norm; + + var phixi = 0.5 * (delvm + delvp); + + delvm *= monoq_limiter_mult; + delvp *= monoq_limiter_mult; + + if delvm < phixi then phixi = delvm; + if delvp < phixi then phixi = delvp; + if phixi < 0 then phixi = 0.0; + if phixi > monoq_max_slope then phixi = monoq_max_slope; + + /* phieta */ + norm = 1.0 / (delv_eta[i] + ptiny); + + select bcMask & ETA_M { + when 0 do delvm = delv_eta[letam[i]]; + when ETA_M_SYMM do delvm = delv_eta[i]; + when ETA_M_FREE do delvm = 0.0; + } + select bcMask & ETA_P { + when 0 do delvp = delv_eta[letap[i]]; + when ETA_P_SYMM do delvp = delv_eta[i]; + when ETA_P_FREE do delvp = 0.0; + } + + delvm = delvm * norm; + delvp = delvp * norm; + + var phieta = 0.5 * (delvm + delvp); + + delvm *= monoq_limiter_mult; + delvp *= monoq_limiter_mult; + + if delvm < phieta then phieta = delvm; + if delvp < phieta then phieta = delvp; + if phieta < 0.0 then phieta = 0.0; + if phieta > monoq_max_slope then phieta = monoq_max_slope; + + /* phizeta */ + norm = 1.0 / (delv_zeta[i] + ptiny); + + select bcMask & ZETA_M { + when 0 do delvm = delv_zeta[lzetam[i]]; + when ZETA_M_SYMM do delvm = delv_zeta[i]; + when ZETA_M_FREE do delvm = 0.0; + } + select bcMask & ZETA_P { + when 0 do delvp = delv_zeta[lzetap[i]]; + when ZETA_P_SYMM do delvp = delv_zeta[i]; + when ZETA_P_FREE do delvp = 0.0; + } + + delvm = delvm * norm; + delvp = delvp * norm; + + var phizeta = 0.5 * (delvm + delvp); + + delvm *= monoq_limiter_mult; + delvp *= monoq_limiter_mult; + + if delvm < phizeta then phizeta = delvm; + if delvp < phizeta then phizeta = delvp; + if phizeta < 0.0 then phizeta = 0.0; + if phizeta > monoq_max_slope then phizeta = monoq_max_slope; + + /* Remove length scale */ + var qlin, qquad: real; + if vdov[i] > 0.0 { + qlin = 0.0; + qquad = 0.0; + } else { + var delvxxi = delv_xi[i] * delx_xi[i], + delvxeta = delv_eta[i] * delx_eta[i], + delvxzeta = delv_zeta[i] * delx_zeta[i]; + + if delvxxi > 0.0 then delvxxi = 0.0; + if delvxeta > 0.0 then delvxeta = 0.0; + if delvxzeta > 0.0 then delvxzeta = 0.0; + + const rho = elemMass[i] / (volo[i] * vnew[i]); + + qlin = -qlc_monoq * rho * + ( delvxxi * (1.0 - phixi) + + delvxeta * (1.0 - phieta) + + delvxzeta * (1.0 - phizeta)); + + qquad = qqc_monoq * rho * + ( delvxxi**2 * (1.0 - phixi**2) + + delvxeta**2 * (1.0 - phieta**2) + + delvxzeta**2 * (1.0 - phizeta**2)); + } + qq[i] = qquad; + ql[i] = qlin; + + } +} + + +proc EvalEOSForElems(vnewc) { + const rho0 = refdens; + + var e_old, delvc, p_old, q_old, compression, compHalfStep, + qq_old, ql_old, work, p_new, e_new, q_new, bvc, pbvc: [Elems] real; + + /* compress data, minimal set */ + forall i in MatElems { + e_old[i] = e[i]; + delvc[i] = delv[i]; + p_old[i] = p[i]; + q_old[i] = q[i]; + qq_old[i] = qq[i]; + ql_old[i] = ql[i]; + } + + // + // TODO: Uncomment local once sparse domain is distributed + // + forall i in Elems 
/* do local */ {
+    compression[i] = 1.0 / vnewc[i] - 1.0;
+    const vchalf = vnewc[i] - delvc[i] * 0.5;
+    compHalfStep[i] = 1.0 / vchalf - 1.0;
+  }
+
+  /* Check for v > eosvmax or v < eosvmin */
+  // (note: I think this was already checked for in calling function!)
+  if eosvmin != 0.0 {
+    forall i in Elems {
+      if vnewc[i] <= eosvmin then compHalfStep[i] = compression[i];
+    }
+  }
+  if eosvmax != 0.0 {
+    forall i in Elems {
+      if vnewc[i] >= eosvmax {
+        p_old[i] = 0.0;
+        compression[i] = 0.0;
+        compHalfStep[i] = 0.0;
+      }
+    }
+  }
+
+  CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
+                     p_old, e_old, q_old, compression, compHalfStep,
+                     vnewc, work, delvc, qq_old, ql_old);
+
+  forall i in MatElems {
+    p[i] = p_new[i];
+    e[i] = e_new[i];
+    q[i] = q_new[i];
+  }
+
+  CalcSoundSpeedForElems(vnewc, rho0, e_new, p_new, pbvc, bvc);
+}
+
+
+proc CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
+                        p_old, e_old, q_old, compression, compHalfStep,
+                        vnewc, work, delvc, qq_old, ql_old) {
+  // TODO: might need to move these consts into foralls or global
+  // Otherwise, they live on Locale0 and everyone else has to do
+  // remote reads. OR: Check if they're remote value forwarded.
+  const rho0 = refdens;
+  const sixth = 1.0 / 6.0;
+
+  var pHalfStep: [MatElems] real;
+
+  forall i in Elems {
+    e_new[i] = e_old[i] - 0.5 * delvc[i] * (p_old[i] + q_old[i])
+             + 0.5 * work[i];
+    if e_new[i] < emin then e_new[i] = emin;
+  }
+
+  CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep,
+                       vnewc, pmin, p_cut, eosvmax);
+
+  forall i in Elems {
+    const vhalf = 1.0 / (1.0 + compHalfStep[i]);
+
+    if delvc[i] > 0.0 {
+      q_new[i] = 0.0;
+    } else {
+      var ssc = (pbvc[i] * e_new[i] + vhalf**2 * bvc[i] * pHalfStep[i]) / rho0;
+      if ssc <= 0.0 then ssc = 0.333333e-36;
+                    else ssc = sqrt(ssc);
+      q_new[i] = ssc * ql_old[i] + qq_old[i];
+    }
+
+    e_new[i] += 0.5 * delvc[i]
+      * (3.0*(p_old[i] + q_old[i]) - 4.0*(pHalfStep[i] + q_new[i]));
+  }
+  forall i in Elems {
+    e_new[i] += 0.5 * work[i];
+    if abs(e_new[i]) < e_cut then e_new[i] = 0.0;
+    if e_new[i] < emin then e_new[i] = emin;
+  }
+
+  CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc, pmin,
+                       p_cut, eosvmax);
+
+  forall i in Elems {
+    var q_tilde:real;
+
+    if delvc[i] > 0.0 {
+      q_tilde = 0.0;
+    } else {
+      var ssc = (pbvc[i] * e_new[i] + vnewc[i]**2 * bvc[i] * p_new[i] ) / rho0;
+      if ssc <= 0.0 then ssc = 0.333333e-36;
+                    else ssc = sqrt(ssc);
+      q_tilde = ssc * ql_old[i] + qq_old[i];
+    }
+
+    e_new[i] -= (7.0*(p_old[i] + q_old[i])
+                 - 8.0*(pHalfStep[i] + q_new[i])
+                 + (p_new[i] + q_tilde)) * delvc[i] * sixth;
+    if abs(e_new[i]) < e_cut then e_new[i] = 0.0;
+    if e_new[i] < emin then e_new[i] = emin;
+  }
+
+  CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc, pmin,
+                       p_cut, eosvmax);
+
+
+  //
+  // TODO: Uncomment local once sparse domain is distributed
+  //
+  forall i in Elems /* do local */ {
+    if delvc[i] <= 0.0 {
+      var ssc = (pbvc[i] * e_new[i] + vnewc[i]**2 * bvc[i] * p_new[i] ) / rho0;
+      if ssc <= 0.0 then ssc = 0.333333e-36;
+                    else ssc = sqrt(ssc);
+      q_new[i] = ssc * ql_old[i] + qq_old[i];
+      if abs(q_new[i]) < q_cut then q_new[i] = 0.0;
+    }
+  }
+}
+
+
+proc CalcSoundSpeedForElems(vnewc, rho0:real, enewc, pnewc, pbvc, bvc) {
+  // TODO: Open question: If we had multiple materials, should (a) ss
+  // be zeroed and accumulated into, and (b) updated atomically to
+  // avoid losing updates?
(Jeff will go back and think on this) + // + forall i in MatElems { + var ssTmp = (pbvc[i] * enewc[i] + vnewc[i]**2 * bvc[i] * pnewc[i]) / rho0; + if ssTmp <= 1.111111e-36 then ssTmp = 1.111111e-36; + ss[i] = sqrt(ssTmp); + } +} + + +iter elemToNodes(elem) { + for param i in 1..nodesPerElem do + yield elemToNode[elem][i]; +} + +iter elemToNodesTuple(e) { + for i in 1..nodesPerElem do + yield (elemToNode[e][i], i); +} + + +proc deprint(title:string, x:[?D] real, y:[D]real, z:[D]real) { + writeln(title); + for i in D { + writeln(format("%3d", i), ": ", + format("%3.4e", x[i]), " ", + format("%3.4e", y[i]), " ", + format("%3.4e", z[i])); + } +} + + diff --git a/samples/Chapel/nbody.chpl b/samples/Chapel/nbody.chpl new file mode 100644 index 00000000..f46e4f68 --- /dev/null +++ b/samples/Chapel/nbody.chpl @@ -0,0 +1,147 @@ +/* The Computer Language Benchmarks Game + http://benchmarksgame.alioth.debian.org/ + + contributed by Albert Sidelnik + modified by Brad Chamberlain +*/ + + +// +// The number of timesteps to simulate; may be set via the command-line +// +config const n = 10000; + +// +// Constants representing pi, the solar mass, and the number of days per year +// +const pi = 3.141592653589793, + solarMass = 4 * pi**2, + daysPerYear = 365.24; + +// +// a record representing one of the bodies in the solar system +// +record body { + var pos: 3*real; + var v: 3*real; + var mass: real; // does not change after it is set up +} + +// +// the array of bodies that we'll be simulating +// +var bodies = [/* sun */ + new body(mass = solarMass), + + /* jupiter */ + new body(pos = ( 4.84143144246472090e+00, + -1.16032004402742839e+00, + -1.03622044471123109e-01), + v = ( 1.66007664274403694e-03 * daysPerYear, + 7.69901118419740425e-03 * daysPerYear, + -6.90460016972063023e-05 * daysPerYear), + mass = 9.54791938424326609e-04 * solarMass), + + /* saturn */ + new body(pos = ( 8.34336671824457987e+00, + 4.12479856412430479e+00, + -4.03523417114321381e-01), + v = (-2.76742510726862411e-03 * daysPerYear, + 4.99852801234917238e-03 * daysPerYear, + 2.30417297573763929e-05 * daysPerYear), + mass = 2.85885980666130812e-04 * solarMass), + + /* uranus */ + new body(pos = ( 1.28943695621391310e+01, + -1.51111514016986312e+01, + -2.23307578892655734e-01), + v = ( 2.96460137564761618e-03 * daysPerYear, + 2.37847173959480950e-03 * daysPerYear, + -2.96589568540237556e-05 * daysPerYear), + mass = 4.36624404335156298e-05 * solarMass), + + /* neptune */ + new body(pos = ( 1.53796971148509165e+01, + -2.59193146099879641e+01, + 1.79258772950371181e-01), + v = ( 2.68067772490389322e-03 * daysPerYear, + 1.62824170038242295e-03 * daysPerYear, + -9.51592254519715870e-05 * daysPerYear), + mass = 5.15138902046611451e-05 * solarMass) + ]; + +// +// the number of bodies to be simulated +// +const numbodies = bodies.numElements; + +// +// The computation involves initializing the sun's velocity, +// writing the initial energy, advancing the system through 'n' +// timesteps, and writing the final energy. 
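An editorial aside for readers new to Chapel (not part of the patch): arithmetic operators on homogeneous tuples such as the 3*real fields above are applied elementwise, which is what lets advance() below update positions and velocities without per-component loops (e.g. b.pos += dt * b.v). A minimal sketch with made-up values:

var pos: 3*real = (1.0, 2.0, 3.0),
    vel: 3*real = (0.1, 0.2, 0.3);
pos += 0.5 * vel;   // elementwise: pos is now (1.05, 2.1, 3.15)
writeln(pos);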
+// +proc main() { + initSun(); + + writef("%.9r\n", energy()); + for 1..n do + advance(0.01); + writef("%.9r\n", energy()); +} + +// +// compute the sun's initial velocity +// +proc initSun() { + const p = + reduce (for b in bodies do (b.v * b.mass)); + bodies[1].v = -p / solarMass; +} + +// +// advance the positions and velocities of all the bodies +// +proc advance(dt) { + for i in 1..numbodies { + for j in i+1..numbodies { + updateVelocities(bodies[i], bodies[j]); + + inline proc updateVelocities(ref b1, ref b2) { + const dpos = b1.pos - b2.pos, + mag = dt / sqrt(sumOfSquares(dpos))**3; + + b1.v -= dpos * b2.mass * mag; + b2.v += dpos * b1.mass * mag; + } + } + } + + for b in bodies do + b.pos += dt * b.v; +} + +// +// compute the energy of the bodies +// +proc energy() { + var e = 0.0; + + for i in 1..numbodies { + const b1 = bodies[i]; + + e += 0.5 * b1.mass * sumOfSquares(b1.v); + + for j in i+1..numbodies { + const b2 = bodies[j]; + + e -= (b1.mass * b2.mass) / sqrt(sumOfSquares(b1.pos - b2.pos)); + } + } + + return e; +} + +// +// a helper routine to compute the sum of squares of a 3-tuple's components +// +inline proc sumOfSquares(x) + return x(1)**2 + x(2)**2 + x(3)**2; diff --git a/samples/Chapel/quicksort.chpl b/samples/Chapel/quicksort.chpl new file mode 100644 index 00000000..33d9722c --- /dev/null +++ b/samples/Chapel/quicksort.chpl @@ -0,0 +1,145 @@ +// +// An example of a parallel quick sort implementation that uses +// "cobegin" to make each recursive call in parallel and "serial" to +// limit the number of threads. +// + +use Random, Time; // for random number generation and the Timer class + +var timer: Timer; // to time the sort + +config var n: int = 2**15; // the size of the array to be sorted +config var thresh: int = 1; // the recursive depth to serialize +config var verbose: int = 0; // print out this many elements in array +config var timing: bool = true; // set timing to false to disable timer + +var A: [1..n] real; // array of real numbers + +// +// initialize array with random numbers +// +fillRandom(A); + +// +// print out front of array if verbose flag is set +// +if verbose > 0 then + writeln("A[1..", verbose, "] = ", A[1..verbose]); + +// +// start timer, call parallel quick sort routine, stop timer +// +if timing then timer.start(); +pqsort(A, thresh); +if timing then timer.stop(); + +// +// report sort time +// +if timing then writeln("sorted in ", timer.elapsed(), " seconds"); + +// +// print out front of array if verbose flag is set +// values should now be in sorted order +// +if verbose > 0 then + writeln("A[1..", verbose, "] = ", A[1..verbose]); + +// +// verify that array is sorted or halt +// +for i in 2..n do + if A(i) < A(i-1) then + halt("A(", i-1, ") == ", A(i-1), " > A(", i, ") == ", A(i)); + +writeln("verification success"); + +// +// pqsort -- parallel quick sort +// +// arr: generic 1D array of values (real, int, ...) 
+// thresh: number of recursive calls to make before serializing +// low: lower bound of array to start sort at, defaults to whole array +// high: upper bound of array to stop sort at, defaults to whole array +// +proc pqsort(arr: [], + thresh: int, + low: int = arr.domain.low, + high: int = arr.domain.high) where arr.rank == 1 { + + // + // base case: arr[low..high] is small enough to bubble sort + // + if high - low < 8 { + bubbleSort(arr, low, high); + return; + } + + // + // determine pivot and partition arr[low..high] + // + const pivotVal = findPivot(); + const pivotLoc = partition(pivotVal); + + // + // make recursive calls to parallel quick sort each unsorted half of + // the array; if thresh is 0 or less, start executing conquer tasks + // serially + // + serial thresh <= 0 do cobegin { + pqsort(arr, thresh-1, low, pivotLoc-1); + pqsort(arr, thresh-1, pivotLoc+1, high); + } + + // + // findPivot -- helper routine to find pivot value using simple + // median-of-3 method, returns pivot value + // + proc findPivot() { + const mid = low + (high-low+1) / 2; + + if arr(mid) < arr(low) then arr(mid) <=> arr(low); + if arr(high) < arr(low) then arr(high) <=> arr(low); + if arr(high) < arr(mid) then arr(high) <=> arr(mid); + + const pivotVal = arr(mid); + arr(mid) = arr(high-1); + arr(high-1) = pivotVal; + + return pivotVal; + } + + // + // partition -- helper routine to partition array such that all + // values less than pivot are to its left and all + // values greater than pivot are to its right, returns + // pivot location + // + proc partition(pivotVal) { + var ilo = low, ihi = high-1; + while (ilo < ihi) { + do { ilo += 1; } while arr(ilo) < pivotVal; + do { ihi -= 1; } while pivotVal < arr(ihi); + if (ilo < ihi) { + arr(ilo) <=> arr(ihi); + } + } + arr(high-1) = arr(ilo); + arr(ilo) = pivotVal; + return ilo; + } +} + +// +// bubbleSort -- bubble sort for base case of quick sort +// +// arr: generic 1D array of values (real, int, ...) +// low: lower bound of array to start sort at +// high: upper bound of array to stop sort at +// +proc bubbleSort(arr: [], low: int, high: int) where arr.rank == 1 { + for i in low..high do + for j in low..high-1 do + if arr(j) > arr(j+1) then + arr(j) <=> arr(j+1); +}
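A closing note on the task constructs that quicksort.chpl relies on (a standalone editorial sketch, not part of the patch; sumHalves, parts, and nums are made-up names): cobegin runs each of its component statements as a concurrent task and waits for them to finish, while a serial clause whose condition is true suppresses that task creation, which is how pqsort above caps parallelism once its recursion-depth threshold is used up. The same pattern in a self-contained program:

// Sum an array by splitting it in half, spawning tasks until 'depth' runs out.
proc sumHalves(arr: [] int, lo: int, hi: int, depth: int): int {
  // small base case: reduce serially
  if hi - lo < 4 then
    return + reduce arr[lo..hi];

  const mid = lo + (hi - lo) / 2;
  var parts: [0..1] int;

  // once 'depth' is exhausted, 'serial' makes the cobegin run its tasks one by one
  serial depth <= 0 do cobegin {
    parts[0] = sumHalves(arr, lo, mid, depth - 1);
    parts[1] = sumHalves(arr, mid + 1, hi, depth - 1);
  }

  return parts[0] + parts[1];
}

var nums: [1..16] int = [i in 1..16] i;
writeln(sumHalves(nums, 1, 16, 2));   // prints 136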