Giter Club home page Giter Club logo

Comments (9)

KristofferC avatar KristofferC commented on August 21, 2024 1

For example:

Tensors.jl

julia> @code_native t2 ⋅ t2
        .section        __TEXT,__text,regular,pure_instructions
        vmovupd (%rsi), %xmm0
        vmovsd  16(%rsi), %xmm1         ## xmm1 = mem[0],zero
        vinsertf128     $1, %xmm1, %ymm0, %ymm0
        vmovupd 24(%rsi), %xmm1
        vmovsd  40(%rsi), %xmm2         ## xmm2 = mem[0],zero
        vinsertf128     $1, %xmm2, %ymm1, %ymm1
        vmovupd 48(%rsi), %xmm2
        vmovsd  64(%rsi), %xmm3         ## xmm3 = mem[0],zero
        vinsertf128     $1, %xmm3, %ymm2, %ymm2
        vbroadcastsd    (%rdx), %ymm3
        vmulpd  %ymm3, %ymm0, %ymm3
        vbroadcastsd    8(%rdx), %ymm4
        vfmadd213pd     %ymm3, %ymm1, %ymm4
        vbroadcastsd    16(%rdx), %ymm3
        vfmadd213pd     %ymm4, %ymm2, %ymm3
        vbroadcastsd    24(%rdx), %ymm4
        vmulpd  %ymm4, %ymm0, %ymm4
        vbroadcastsd    32(%rdx), %ymm5
        vfmadd213pd     %ymm4, %ymm1, %ymm5
        vbroadcastsd    40(%rdx), %ymm4
        vfmadd213pd     %ymm5, %ymm2, %ymm4
        vbroadcastsd    48(%rdx), %ymm5
        vmulpd  %ymm5, %ymm0, %ymm0
        vbroadcastsd    56(%rdx), %ymm5
        vfmadd213pd     %ymm0, %ymm1, %ymm5
        vbroadcastsd    64(%rdx), %ymm0
        vfmadd213pd     %ymm5, %ymm2, %ymm0
        vbroadcastsd    %xmm4, %ymm1
        vblendpd        $8, %ymm1, %ymm3, %ymm1 ## ymm1 = ymm3[0,1,2],ymm1[3]
        vmovupd %ymm1, (%rdi)
        vextractf128    $1, %ymm0, %xmm1
        vinsertf128     $1, %xmm0, %ymm0, %ymm0
        vpermpd $233, %ymm4, %ymm2      ## ymm2 = ymm4[1,2,2,3]
        vblendpd        $12, %ymm0, %ymm2, %ymm0 ## ymm0 = ymm2[0,1],ymm0[2,3]
        vmovupd %ymm0, 32(%rdi)
        vmovlpd %xmm1, 64(%rdi)
        movq    %rdi, %rax
        vzeroupper
        retq
        nopw    %cs:(%rax,%rax)

StaticArrays.jl:

julia> @code_native s2 * s2
        .section        __TEXT,__text,regular,pure_instructions
        vmovsd  (%rdx), %xmm0           ## xmm0 = mem[0],zero
        vmovsd  8(%rdx), %xmm7          ## xmm7 = mem[0],zero
        vmovsd  16(%rdx), %xmm6         ## xmm6 = mem[0],zero
        vmovsd  16(%rsi), %xmm11        ## xmm11 = mem[0],zero
        vmovsd  40(%rsi), %xmm12        ## xmm12 = mem[0],zero
        vmovsd  64(%rsi), %xmm9         ## xmm9 = mem[0],zero
        vmovsd  24(%rdx), %xmm3         ## xmm3 = mem[0],zero
        vmovupd (%rsi), %xmm4
        vmovupd 8(%rsi), %xmm10
        vmovhpd (%rsi), %xmm11, %xmm5   ## xmm5 = xmm11[0],mem[0]
        vinsertf128     $1, %xmm5, %ymm4, %ymm5
        vunpcklpd       %xmm3, %xmm0, %xmm1 ## xmm1 = xmm0[0],xmm3[0]
        vmovddup        %xmm0, %xmm0    ## xmm0 = xmm0[0,0]
        vinsertf128     $1, %xmm1, %ymm0, %ymm0
        vmulpd  %ymm0, %ymm5, %ymm0
        vmovsd  32(%rdx), %xmm5         ## xmm5 = mem[0],zero
        vmovupd 24(%rsi), %xmm8
        vmovhpd 24(%rsi), %xmm12, %xmm1 ## xmm1 = xmm12[0],mem[0]
        vinsertf128     $1, %xmm1, %ymm8, %ymm1
        vunpcklpd       %xmm5, %xmm7, %xmm2 ## xmm2 = xmm7[0],xmm5[0]
        vmovddup        %xmm7, %xmm7    ## xmm7 = xmm7[0,0]
        vinsertf128     $1, %xmm2, %ymm7, %ymm2
        vmulpd  %ymm2, %ymm1, %ymm1
        vaddpd  %ymm1, %ymm0, %ymm1
        vmovsd  40(%rdx), %xmm0         ## xmm0 = mem[0],zero
        vmovhpd 48(%rsi), %xmm9, %xmm2  ## xmm2 = xmm9[0],mem[0]
        vmovupd 48(%rsi), %xmm13
        vinsertf128     $1, %xmm2, %ymm13, %ymm2
        vunpcklpd       %xmm0, %xmm6, %xmm7 ## xmm7 = xmm6[0],xmm0[0]
        vmovddup        %xmm6, %xmm6    ## xmm6 = xmm6[0,0]
        vinsertf128     $1, %xmm7, %ymm6, %ymm6
        vmulpd  %ymm6, %ymm2, %ymm2
        vaddpd  %ymm2, %ymm1, %ymm14
        vmovsd  48(%rdx), %xmm1         ## xmm1 = mem[0],zero
        vmovsd  56(%rdx), %xmm2         ## xmm2 = mem[0],zero
        vmovsd  64(%rdx), %xmm6         ## xmm6 = mem[0],zero
        vinsertf128     $1, %xmm4, %ymm10, %ymm4
        vmovddup        %xmm3, %xmm3    ## xmm3 = xmm3[0,0]
        vmovddup        %xmm1, %xmm7    ## xmm7 = xmm1[0,0]
        vinsertf128     $1, %xmm7, %ymm3, %ymm3
        vmulpd  %ymm3, %ymm4, %ymm3
        vmovupd 32(%rsi), %xmm4
        vinsertf128     $1, %xmm8, %ymm4, %ymm4
        vmovddup        %xmm5, %xmm5    ## xmm5 = xmm5[0,0]
        vmovddup        %xmm2, %xmm7    ## xmm7 = xmm2[0,0]
        vinsertf128     $1, %xmm7, %ymm5, %ymm5
        vmulpd  %ymm5, %ymm4, %ymm4
        vaddpd  %ymm4, %ymm3, %ymm3
        vmovupd 56(%rsi), %xmm4
        vinsertf128     $1, %xmm13, %ymm4, %ymm4
        vmovddup        %xmm0, %xmm0    ## xmm0 = xmm0[0,0]
        vmovddup        %xmm6, %xmm5    ## xmm5 = xmm6[0,0]
        vinsertf128     $1, %xmm5, %ymm0, %ymm0
        vmulpd  %ymm0, %ymm4, %ymm0
        vaddpd  %ymm0, %ymm3, %ymm0
        vmulsd  %xmm1, %xmm11, %xmm1
        vmulsd  %xmm2, %xmm12, %xmm2
        vaddsd  %xmm2, %xmm1, %xmm1
        vmulsd  %xmm6, %xmm9, %xmm2
        vaddsd  %xmm2, %xmm1, %xmm1
        vmovupd %ymm14, (%rdi)
        vmovupd %ymm0, 32(%rdi)
        vmovsd  %xmm1, 64(%rdi)
        movq    %rdi, %rax
        vzeroupper
        retq
        nop

Perhaps StaticArrays should use muladds?

from fembasis.jl.

ahojukka5 avatar ahojukka5 commented on August 21, 2024

I agree. To do this, we should support both, build-in arrays and static arrays. It would be very interesting to measure and compare performance of functions.

from fembasis.jl.

haampie avatar haampie commented on August 21, 2024

The only problem is that static arrays are immutable, so the eval_basis! functions cannot mutate the static array. I didn't really look at how eval_basis! was being used, so therefore I closed the issue in the end.

from fembasis.jl.

ahojukka5 avatar ahojukka5 commented on August 21, 2024

Yes but it's not a big problem, we can also generate eval_basis and eval_dbasis, here is the performance penalty. I actually tried to generate tuple versions but for some reason didn't get a good speedup and then forgot whole thing, maybe StaticArrays.jl would do the job better. What we need in practice is to calculate determinant of Jacobian and gradient of some quantity (displacement, temperature) with respect to X. I have been trying to optimize the performance of these operations here. I'm quite sure that we easily get more speed for this but I need some help to make it better. Here is how this is used in other code at the moment.

https://github.com/JuliaFEM/JuliaFEM.jl/blob/master/src/problems_elasticity.jl#L140

from fembasis.jl.

KristofferC avatar KristofferC commented on August 21, 2024

I think using https://github.com/KristofferC/Tensors.jl/blob/master/src/Tensors.jl would be better for this since they come with many convenience functions that you want to use in FEM. It also has real support for operations with 4-th order tensors and 2-nd and avoids using e.g. Voigt notation. I can give it a try.

from fembasis.jl.

KristofferC avatar KristofferC commented on August 21, 2024

Tensors also tend to have better performance

Scalar product between two rank 2 tensors

# Tensors.jl
julia> @btime $t2 ⋅ $t2
  4.397 ns (0 allocations: 0 bytes)

# StaticArrays.jl
julia> @btime $s2 * $s2
  10.063 ns (0 allocations: 0 bytes)

4th order tensor double contracted with second order

# Tensors.jl
julia> @btime $t4 ⊡ $t2
  9.080 ns (0 allocations: 0 bytes)

# StaticArrays.jl
julia> @btime $s4 * $s2
  11.577 ns (0 allocations: 0 bytes)

from fembasis.jl.

haampie avatar haampie commented on August 21, 2024

How did you achieve this exactly? Shouldn't StaticArrays.jl be on par?

from fembasis.jl.

KristofferC avatar KristofferC commented on August 21, 2024

SIMD.jl

https://github.com/KristofferC/Tensors.jl/blob/master/src/simd.jl

from fembasis.jl.

TeroFrondelius avatar TeroFrondelius commented on August 21, 2024

I like the idea of using Tensors.jl. The source code should be as close to mathematical equations found from scientific articles as possible. Then the domain specific research can understand the source code as easily as possible.

from fembasis.jl.

Related Issues (18)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.