The Dart backend has a new class, float32vec, that encapsulates a list of Float32x4 SIMD vectors. The class acts like a normal vector, allowing you to do element-wise operations and look-ups, while under the hood it indexes and operates on the appropriate sub-vector. This allows you to write SIMD-accelerated code without having to manually break things apart into chunks of four. Looping over the sub-vectors and the encapsulation add some overhead, which slows down performance for arrays with more than 32 elements. The micro-benchmark above was performed with 32 elements and shows better performance than CPython with NumPy.
float32vec
class float32vec: def __init__(self, items): self[...] = new( List() ) self.length = items.length i = 0; s = 0 while i < items.length: x = items[i] y = items[i+1] z = items[i+2] w = items[i+3] vec = new( Float32x4(x,y,z,w) ) self[...].add( vec ) i += 4 def __getitem__(self, index): if index < 0: index = self.length + index float32x4 vec = self[...][ index // 4 ] lane = index % 4 if lane == 0: return vec.x elif lane == 1: return vec.y elif lane == 2: return vec.z elif lane == 3: return vec.w def __setitem__(self, index, value): if index < 0: index = self.length + index vec = self[...][ index // 4 ] lane = index % 4 if lane == 0: vec = vec.withX(value) elif lane == 1: vec = vec.withY(value) elif lane == 2: vec = vec.withZ(value) elif lane == 3: vec = vec.withW(value) self[...][ index // 4 ] = vec def __add__(self, other): arr = new( List() ) for i, vec1 in enumerate( self[...] ): vec2 = other[...][ i ] arr.add( vec1+vec2 ) v = inline("new float32vec([])") v.length = self.length v[...] = arr return v def __mul__(self, other): arr = new( List() ) for i, vec1 in enumerate( self[...] ): vec2 = other[...][ i ] arr.add( vec1*vec2 ) v = inline("new float32vec([])") v.length = self.length v[...] = arr return v
No comments:
Post a Comment