FAQ
It seems the simplest web server, printing just "Hello World", is not comparable
to nginx, apache, etc. I can get about 3000 requests/second for Go with max
procs set to the number of CPUs, but easily get nginx serving a larger static
file at about 5000 requests/second.

I profiled the program. Here is the result. I also added the source code at
the end. Can someone please explain why syscall.Syscall and
runtime.futex are taking so many cycles? Can it be improved?

(pprof) top
Total: 6053 samples
2243 37.1% 37.1% 2249 37.2% syscall.Syscall
2153 35.6% 72.6% 2153 35.6% runtime.futex
443 7.3% 79.9% 446 7.4% syscall.Syscall6
92 1.5% 81.5% 92 1.5% bytes.IndexByte
86 1.4% 82.9% 159 2.6% scanblock
81 1.3% 84.2% 81 1.3% syscall.RawSyscall6
68 1.1% 85.3% 68 1.1% runtime.usleep
37 0.6% 86.0% 67 1.1% sweep
35 0.6% 86.5% 35 0.6% runtime.memmove
33 0.5% 87.1% 74 1.2% runtime.MCache_Alloc
(pprof) top --cum
Total: 6053 samples
0 0.0% 0.0% 5350 88.4% schedunlock
3 0.0% 0.0% 2957 48.9% net/http.(*conn).serve
2243 37.1% 37.1% 2249 37.2% syscall.Syscall
2153 35.6% 72.7% 2153 35.6% runtime.futex
0 0.0% 72.7% 1715 28.3% runtime.futexwakeup
0 0.0% 72.7% 1706 28.2% runtime.notewakeup
1 0.0% 72.7% 1638 27.1% net/http.(*response).finishRequest
1 0.0% 72.7% 1628 26.9% bufio.(*Writer).Flush
1 0.0% 72.7% 1628 26.9% net.(*TCPConn).Write
2 0.0% 72.8% 1628 26.9% net.(*netFD).Write

The source code server.go:

package main

import (
"flag"
"log"
"net/http"
"os"
"runtime"
"runtime/pprof"
"time"
)

// HelloServer is the benchmark handler: it answers every request with a
// fixed plain-text greeting. The request itself is not inspected.
func HelloServer(w http.ResponseWriter, req *http.Request) {
	greeting := []byte("hello, world!\n")

	hdr := w.Header()
	hdr.Set("Content-Type", "text/plain")

	w.Write(greeting)
}

// cpuprofile holds the value of the -cpuprofile command-line flag: the path
// of the file to write a CPU profile to. It defaults to the empty string.
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

// main serves HelloServer on :8080. When -cpuprofile is given, a CPU profile
// of the first 100 seconds of execution is written to that file.
func main() {
	// Allow the scheduler to run goroutines on every available CPU.
	runtime.GOMAXPROCS(runtime.NumCPU())

	flag.Parse()
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		// StartCPUProfile can fail (for example if profiling is already
		// active); do not silently run the benchmark without a profile.
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		// The server below never returns on its own, so the profile is
		// stopped from a background goroutine after a fixed window.
		go func() {
			time.Sleep(100 * time.Second)
			pprof.StopCPUProfile()
			// Close the file once profiling has stopped writing to it.
			if err := f.Close(); err != nil {
				log.Print(err)
			}
		}()
	}

	http.HandleFunc("/", HelloServer)

	srv := &http.Server{
		Addr:        ":8080",
		Handler:     http.DefaultServeMux,
		ReadTimeout: 5 * time.Second,
	}
	// ListenAndServe always returns a non-nil error; report it instead of
	// exiting silently (e.g. when the port is already in use).
	log.Fatal(srv.ListenAndServe())
}

--

Search Discussions

  • Jesse McNelis at Oct 13, 2012 at 11:41 am

    On Sat, Oct 13, 2012 at 10:09 PM, ChrisLu wrote:
    I profiled the program. Here is the result. I added the source code at the
    end also. Can someone please explain why the syscall.Syscall and
    runtime.futex are taking so much cycles? Can it be improved?
    It's probably the scheduler. When you're doing no calculations having
    more threads will usually degrade performance.
    The goroutines spend all their time moving between threads.
    You'll likely get better performance with GOMAXPROCS=1

    By default the http pkg uses chunked mode if you don't specify a
    content length, chunked mode has some overhead.
    (pprof) top
    Total: 6053 samples
    2243 37.1% 37.1% 2249 37.2% syscall.Syscall
    2153 35.6% 72.6% 2153 35.6% runtime.futex
    443 7.3% 79.9% 446 7.4% syscall.Syscall6
    92 1.5% 81.5% 92 1.5% bytes.IndexByte
    86 1.4% 82.9% 159 2.6% scanblock
    81 1.3% 84.2% 81 1.3% syscall.RawSyscall6
    68 1.1% 85.3% 68 1.1% runtime.usleep
    37 0.6% 86.0% 67 1.1% sweep
    35 0.6% 86.5% 35 0.6% runtime.memmove
    33 0.5% 87.1% 74 1.2% runtime.MCache_Alloc
    (pprof) top --cum
    Total: 6053 samples
    0 0.0% 0.0% 5350 88.4% schedunlock
    3 0.0% 0.0% 2957 48.9% net/http.(*conn).serve
    2243 37.1% 37.1% 2249 37.2% syscall.Syscall
    2153 35.6% 72.7% 2153 35.6% runtime.futex
    0 0.0% 72.7% 1715 28.3% runtime.futexwakeup
    0 0.0% 72.7% 1706 28.2% runtime.notewakeup
    1 0.0% 72.7% 1638 27.1% net/http.(*response).finishRequest
    1 0.0% 72.7% 1628 26.9% bufio.(*Writer).Flush
    1 0.0% 72.7% 1628 26.9% net.(*TCPConn).Write
    2 0.0% 72.8% 1628 26.9% net.(*netFD).Write

    The source code server.go:

    package main

    import (
    "flag"
    "log"
    "net/http"
    "os"
    "runtime"
    "runtime/pprof"
    "time"
    )

    func HelloServer(w http.ResponseWriter, req *http.Request) {
    w.Header().Set("Content-Type", "text/plain")
    w.Write([]byte("hello, world!\n"))
    }

    var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

    func main() {
    runtime.GOMAXPROCS(runtime.NumCPU())

    flag.Parse()
    if *cpuprofile != "" {
    f, err := os.Create(*cpuprofile)
    if err != nil {
    log.Fatal(err)
    }
    pprof.StartCPUProfile(f)
    go func() {
    time.Sleep(100 * time.Second)
    pprof.StopCPUProfile()
    }()
    }

    http.HandleFunc("/", HelloServer)

    srv := &http.Server{
    Addr: ":8080",
    Handler: http.DefaultServeMux,
    ReadTimeout: time.Duration(5) * time.Second,
    }
    srv.ListenAndServe()

    }

    --


    --
    =====================
    http://jessta.id.au

    --
  • ChrisLu at Oct 13, 2012 at 7:57 pm
    It seems that setting the Go max procs to 1 does not affect the performance
    much at all. And the profiling results remain almost the same.

    I tried setting the content length also, but it seems to have no effect on
    performance or profiling either. I think since the chunk size is so small,
    the performance effect does not show up.

    Chris

    (pprof) top
    Total: 7143 samples
    2670 37.4% 37.4% 2836 39.7% syscall.Syscall
    2561 35.9% 73.2% 2561 35.9% runtime.futex
    548 7.7% 80.9% 586 8.2% syscall.Syscall6
    109 1.5% 82.4% 109 1.5% bytes.IndexByte
    106 1.5% 83.9% 106 1.5% syscall.RawSyscall6
    96 1.3% 85.3% 163 2.3% scanblock
    60 0.8% 86.1% 60 0.8% runtime.usleep
    48 0.7% 86.8% 80 1.1% sweep
    34 0.5% 87.2% 346 4.8% runtime.mallocgc
    32 0.4% 87.7% 70 1.0% runtime.MCache_Alloc
    (pprof) top --cum
    Total: 7143 samples
    0 0.0% 0.0% 6350 88.9% schedunlock
    3 0.0% 0.0% 3572 50.0% net/http.(*conn).serve
    2670 37.4% 37.4% 2836 39.7% syscall.Syscall
    2561 35.9% 73.3% 2561 35.9% runtime.futex
    5 0.1% 73.3% 2035 28.5% runtime.futexwakeup
    0 0.0% 73.3% 2019 28.3% runtime.notewakeup
    2 0.0% 73.4% 1953 27.3% net/http.(*response).finishRequest
    4 0.1% 73.4% 1948 27.3% net.(*TCPConn).Write
    1 0.0% 73.4% 1947 27.3% bufio.(*Writer).Flush
    2 0.0% 73.5% 1944 27.2% net.(*netFD).Write

    On Saturday, October 13, 2012 4:41:30 AM UTC-7, Jesse McNelis wrote:

    On Sat, Oct 13, 2012 at 10:09 PM, ChrisLu <chri...@gmail.com <javascript:>>
    wrote:
    I profiled the program. Here is the result. I added the source code at the
    end also. Can someone please explain why the syscall.Syscall and
    runtime.futex are taking so much cycles? Can it be improved?
    It's probably the scheduler. When you're doing no calculations having
    more threads will usually degrade performance.
    The goroutines spend all their time moving between threads.
    You'll likely get better performance with GOMAXPROCS=1

    By default the http pkg uses chunked mode if you don't specify a
    content length, chunked mode has some overhead.
    (pprof) top
    Total: 6053 samples
    2243 37.1% 37.1% 2249 37.2% syscall.Syscall
    2153 35.6% 72.6% 2153 35.6% runtime.futex
    443 7.3% 79.9% 446 7.4% syscall.Syscall6
    92 1.5% 81.5% 92 1.5% bytes.IndexByte
    86 1.4% 82.9% 159 2.6% scanblock
    81 1.3% 84.2% 81 1.3% syscall.RawSyscall6
    68 1.1% 85.3% 68 1.1% runtime.usleep
    37 0.6% 86.0% 67 1.1% sweep
    35 0.6% 86.5% 35 0.6% runtime.memmove
    33 0.5% 87.1% 74 1.2% runtime.MCache_Alloc
    (pprof) top --cum
    Total: 6053 samples
    0 0.0% 0.0% 5350 88.4% schedunlock
    3 0.0% 0.0% 2957 48.9% net/http.(*conn).serve
    2243 37.1% 37.1% 2249 37.2% syscall.Syscall
    2153 35.6% 72.7% 2153 35.6% runtime.futex
    0 0.0% 72.7% 1715 28.3% runtime.futexwakeup
    0 0.0% 72.7% 1706 28.2% runtime.notewakeup
    1 0.0% 72.7% 1638 27.1%
    net/http.(*response).finishRequest
    1 0.0% 72.7% 1628 26.9% bufio.(*Writer).Flush
    1 0.0% 72.7% 1628 26.9% net.(*TCPConn).Write
    2 0.0% 72.8% 1628 26.9% net.(*netFD).Write

    The source code server.go:

    package main

    import (
    "flag"
    "log"
    "net/http"
    "os"
    "runtime"
    "runtime/pprof"
    "time"
    )

    func HelloServer(w http.ResponseWriter, req *http.Request) {
    w.Header().Set("Content-Type", "text/plain")
    w.Write([]byte("hello, world!\n"))
    }

    var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
    func main() {
    runtime.GOMAXPROCS(runtime.NumCPU())

    flag.Parse()
    if *cpuprofile != "" {
    f, err := os.Create(*cpuprofile)
    if err != nil {
    log.Fatal(err)
    }
    pprof.StartCPUProfile(f)
    go func() {
    time.Sleep(100 * time.Second)
    pprof.StopCPUProfile()
    }()
    }

    http.HandleFunc("/", HelloServer)

    srv := &http.Server{
    Addr: ":8080",
    Handler: http.DefaultServeMux,
    ReadTimeout: time.Duration(5) * time.Second,
    }
    srv.ListenAndServe()

    }

    --


    --
    =====================
    http://jessta.id.au
    --
  • Rémy Oudompheng at Oct 13, 2012 at 11:41 am

    On 2012/10/13 ChrisLu wrote:
    Seems the simplest web server printing just "Hello World" is not comparable
    to nginx, apache, etc. I can get about 3000 request/second for Go with max
    procs set to number of CPUs, but easily get nginx serving a larger static
    file about 5000 request/second.

    I profiled the program. Here is the result. I added the source code at the
    end also. Can someone please explain why the syscall.Syscall and
    runtime.futex are taking so much cycles? Can it be improved?
    I think it was recently improved. But remember that optimizing a
    micro-benchmark might not be the wisest thing to do.

    Rémy.

    --
  • ChrisLu at Oct 13, 2012 at 7:17 pm
    It is much faster than before, but I still think it is not good enough
    when all it does is just "Hello World".

    Although it is a micro-benchmark, it is a very common use case. And it
    seems related to the goroutine scheduling contention as in this bug:

    http://code.google.com/p/go/issues/detail?id=2933


    Chris
    http://weed-fs.googlecode.com
    On Saturday, October 13, 2012 4:17:55 AM UTC-7, Rémy Oudompheng wrote:
    On 2012/10/13 ChrisLu <chri...@gmail.com <javascript:>> wrote:
    Seems the simplest web server printing just "Hello World" is not
    comparable
    to nginx, apache, etc. I can get about 3000 request/second for Go with max
    procs set to number of CPUs, but easily get nginx serving a larger static
    file about 5000 request/second.

    I profiled the program. Here is the result. I added the source code at the
    end also. Can someone please explain why the syscall.Syscall and
    runtime.futex are taking so much cycles? Can it be improved?
    I think it was recently improved. But remember that optimizing a
    micro-benchmark might not be the wisest thing to do.

    Rémy.
    --
  • Rémy Oudompheng at Oct 13, 2012 at 7:25 pm

    On 2012/10/13 ChrisLu wrote:
    It is much faster than before, but still not I think is not good enough when
    all it does is just "Hello World".

    Although it is a micro-benchmarking, it is a very common use case. And it
    seems related to the goroutine scheduling contention as in this bug:

    http://code.google.com/p/go/issues/detail?id=2933
    You didn't explain the new benchmark results nor how you came to that
    conclusion.

    Rémy.

    --
  • ChrisLu at Oct 13, 2012 at 7:33 pm

    On Saturday, October 13, 2012 12:25:50 PM UTC-7, Rémy Oudompheng wrote:
    On 2012/10/13 ChrisLu <chri...@gmail.com <javascript:>> wrote:
    It is much faster than before, but still not I think is not good enough when
    all it does is just "Hello World".

    Although it is a micro-benchmarking, it is a very common use case. And it
    seems related to the goroutine scheduling contention as in this bug:

    http://code.google.com/p/go/issues/detail?id=2933
    You didn't explain the new benchmark results nor how you came to that
    conclusion.

    Rémy.
    I was referring to the benchmark difference using some Go pre 1.0 version
    one year ago, vs current Go version. See this
    link: https://groups.google.com/forum/?fromgroups=#!topic/golang-nuts/zeLMYnjO_JA

    Chris

    --
  • Jli Justinli at Oct 15, 2012 at 11:47 pm
    I'm getting the same result here.. I'm trying to build a high-performance
    web server for a particular application, but I can hardly justify it when
    it's slower than the previous C++ thread-per-connection one I'm using. Any
    more
    On Saturday, October 13, 2012 7:09:20 AM UTC-4, ChrisLu wrote:

    Seems the simplest web server printing just "Hello World" is not
    comparable to nginx, apache, etc. I can get about 3000 request/second for
    Go with max procs set to number of CPUs, but easily get nginx serving a
    larger static file about 5000 request/second.

    I profiled the program. Here is the result. I added the source code at the
    end also. Can someone please explain why the syscall.Syscall and
    runtime.futex are taking so much cycles? Can it be improved?

    (pprof) top
    Total: 6053 samples
    2243 37.1% 37.1% 2249 37.2% syscall.Syscall
    2153 35.6% 72.6% 2153 35.6% runtime.futex
    443 7.3% 79.9% 446 7.4% syscall.Syscall6
    92 1.5% 81.5% 92 1.5% bytes.IndexByte
    86 1.4% 82.9% 159 2.6% scanblock
    81 1.3% 84.2% 81 1.3% syscall.RawSyscall6
    68 1.1% 85.3% 68 1.1% runtime.usleep
    37 0.6% 86.0% 67 1.1% sweep
    35 0.6% 86.5% 35 0.6% runtime.memmove
    33 0.5% 87.1% 74 1.2% runtime.MCache_Alloc
    (pprof) top --cum
    Total: 6053 samples
    0 0.0% 0.0% 5350 88.4% schedunlock
    3 0.0% 0.0% 2957 48.9% net/http.(*conn).serve
    2243 37.1% 37.1% 2249 37.2% syscall.Syscall
    2153 35.6% 72.7% 2153 35.6% runtime.futex
    0 0.0% 72.7% 1715 28.3% runtime.futexwakeup
    0 0.0% 72.7% 1706 28.2% runtime.notewakeup
    1 0.0% 72.7% 1638 27.1% net/http.(*response).finishRequest
    1 0.0% 72.7% 1628 26.9% bufio.(*Writer).Flush
    1 0.0% 72.7% 1628 26.9% net.(*TCPConn).Write
    2 0.0% 72.8% 1628 26.9% net.(*netFD).Write

    The source code server.go:

    package main

    import (
    "flag"
    "log"
    "net/http"
    "os"
    "runtime"
    "runtime/pprof"
    "time"
    )

    func HelloServer(w http.ResponseWriter, req *http.Request) {
    w.Header().Set("Content-Type", "text/plain")
    w.Write([]byte("hello, world!\n"))
    }

    var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

    func main() {
    runtime.GOMAXPROCS(runtime.NumCPU())

    flag.Parse()
    if *cpuprofile != "" {
    f, err := os.Create(*cpuprofile)
    if err != nil {
    log.Fatal(err)
    }
    pprof.StartCPUProfile(f)
    go func() {
    time.Sleep(100 * time.Second)
    pprof.StopCPUProfile()
    }()
    }

    http.HandleFunc("/", HelloServer)

    srv := &http.Server{
    Addr: ":8080",
    Handler: http.DefaultServeMux,
    ReadTimeout: time.Duration(5) * time.Second,
    }
    srv.ListenAndServe()

    }
    --
  • Dave Cheney at Oct 15, 2012 at 10:57 pm

    I'm getting the same result here.. I'm trying to build a high-performance
    web server for a particular application, but I can hardly justify it when
    it's slower than the previous C++ thread-per-connection one I'm using. Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave

    --
  • ChrisLu at Oct 16, 2012 at 2:20 am
    The code should be just basic "Hello World", as seen in the original post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it when
    it's slower than the previous C++ thread-per-connection one I'm using. Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
  • Dave Cheney at Oct 16, 2012 at 2:37 am
    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the original post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
  • Chris Lu at Oct 16, 2012 at 3:34 am
    Actually I tried several approaches: setting/unsetting GOMAXPROCS and
    setting the content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris
    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney wrote:

    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the original post.
    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
  • Dave Cheney at Oct 16, 2012 at 4:44 am
    I think the best solution is to resolve
    https://code.google.com/p/go/issues/detail?id=3412, this will reduce
    the amount of scheduler thrashing.
    On Tue, Oct 16, 2012 at 2:11 PM, Chris Lu wrote:
    Actually I tried several approches, but setting/unsetting GOMAXPROCS,
    content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris

    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney wrote:

    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the original
    post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm
    using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
  • Dave Cheney at Oct 18, 2012 at 11:00 am
    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue 3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    On Tue, Oct 16, 2012 at 3:44 PM, Dave Cheney wrote:
    I think the best solution is to resolve
    https://code.google.com/p/go/issues/detail?id=3412, this will reduce
    the amount of scheduler thrashing.
    On Tue, Oct 16, 2012 at 2:11 PM, Chris Lu wrote:
    Actually I tried several approches, but setting/unsetting GOMAXPROCS,
    content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris

    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney wrote:

    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the original
    post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm
    using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
  • Jli Justinli at Oct 18, 2012 at 6:15 pm
    Awesome, this looks really good! Once I get home I'll try it out and let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    On Tue, Oct 16, 2012 at 3:44 PM, Dave Cheney wrote:
    I think the best solution is to resolve
    https://code.google.com/p/go/issues/detail?id=3412, this will reduce
    the amount of scheduler thrashing.
    On Tue, Oct 16, 2012 at 2:11 PM, Chris Lu wrote:
    Actually I tried several approches, but setting/unsetting GOMAXPROCS,
    content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris

    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney wrote:

    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the original
    post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify
    it
    when
    it's slower than the previous C++ thread-per-connection one I'm
    using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
  • Dave Cheney at Oct 19, 2012 at 12:33 am
    Well, this idea still needs validation. In theory using the NB variant should reduce scheduler overhead by not informing it the goroutine is about to block. However, if write(2) does more than copy the buffer into kernel space and return the number of bytes that fit then this approach probably isn't going to improve throughput. The best way to do this is to load test and profile.
    On 19/10/2012, at 3:27, jli.justinli@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and let you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue 3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    On Tue, Oct 16, 2012 at 3:44 PM, Dave Cheney wrote:
    I think the best solution is to resolve
    https://code.google.com/p/go/issues/detail?id=3412, this will reduce
    the amount of scheduler thrashing.
    On Tue, Oct 16, 2012 at 2:11 PM, Chris Lu wrote:
    Actually I tried several approches, but setting/unsetting GOMAXPROCS,
    content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris

    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney wrote:

    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the original
    post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm
    using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
    --
  • ChrisLu at Oct 18, 2012 at 10:16 pm
    Please see my reply(2 hours ago) for the load test and profiling with your
    patch.

    Chris
    On Thursday, October 18, 2012 1:46:47 PM UTC-7, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB variant
    should reduce scheduler overhead by not informing it the goroutine is about
    to block. However, if write(2) does more than copy the buffer into kernel
    space and return the number of bytes that fit then this approach probably
    isn't going to improve throughput. The best way to do this is to load test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com <javascript:> wrote:

    Awesome, this looks really good! Once I get home I'll try it out and let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    --
  • Dave Cheney at Oct 18, 2012 at 10:36 pm
    Thanks Chris, sorry I didn't see your other reply till now. One of the causes of the high % of CPU spent in futex, I believe, is mutex contention. I'll keep investigating.
    On 19/10/2012, at 8:23, ChrisLu wrote:

    Please see my reply(2 hours ago) for the load test and profiling with your patch.

    Chris
    On Thursday, October 18, 2012 1:46:47 PM UTC-7, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB variant should reduce scheduler overhead by not informing it the goroutine is about to block. However, if write(2) does more than copy the buffer into kernel space and return the number of bytes that fit then this approach probably isn't going to improve throughput. The best way to do this is to load test and profile.
    On 19/10/2012, at 3:27, jli.ju...@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and let you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue 3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    --
    --
  • Dave Cheney at Oct 19, 2012 at 1:19 am
    @Chris, what program are you using to simulate the client ? Are you
    using siege like Justin ?
    On Fri, Oct 19, 2012 at 8:30 AM, Dave Cheney wrote:
    Thanks Chris, sorry I didn't see your other reply till now. One of the
    causes of the high % of CPU spent in futex, I believe, is mutex contention.
    I'll keep investigating.


    On 19/10/2012, at 8:23, ChrisLu wrote:

    Please see my reply(2 hours ago) for the load test and profiling with your
    patch.

    Chris
    On Thursday, October 18, 2012 1:46:47 PM UTC-7, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB variant
    should reduce scheduler overhead by not informing it the goroutine is about
    to block. However, if write(2) does more than copy the buffer into kernel
    space and return the number of bytes that fit then this approach probably
    isn't going to improve throughput. The best way to do this is to load test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    --
    --
  • ChrisLu at Oct 19, 2012 at 3:20 am
    I simply use "ab -n 10000 -c 3 http://localhost:8080/".

    This is run comparing a Go helloworld with Nginx. I got results like
    3600 req/sec vs 5500 req/sec on my computer.

    Chris
    On Thursday, October 18, 2012 6:13:04 PM UTC-7, Dave Cheney wrote:

    @Chris, what program are you using to simulate the client ? Are you
    using siege like Justin ?
    On Fri, Oct 19, 2012 at 8:30 AM, Dave Cheney wrote:
    Thanks Chris, sorry I didn't see your other reply till now. One of the
    causes of the high % of CPU spent in futex, I believe, is mutex
    contention.
    I'll keep investigating.


    On 19/10/2012, at 8:23, ChrisLu <chri...@gmail.com <javascript:>> wrote:
    Please see my reply(2 hours ago) for the load test and profiling with your
    patch.

    Chris
    On Thursday, October 18, 2012 1:46:47 PM UTC-7, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB variant
    should reduce scheduler overhead by not informing it the goroutine is
    about
    to block. However, if write(2) does more than copy the buffer into
    kernel
    space and return the number of bytes that fit then this approach
    probably
    isn't going to improve throughput. The best way to do this is to load
    test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and
    let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    --
    --
  • Dave Cheney at Oct 19, 2012 at 3:23 am
    That explains why you are spending so much time in syscall.Close, ab
    uses http/1.0 mode without persistent connections.
    On Fri, Oct 19, 2012 at 2:20 PM, ChrisLu wrote:
    I simply use "ab -n 10000 -c 3 http://localhost:8080/".

    This is run comparing a Go helloword with Nginx. I got results like
    3600req/sec vis 5500req/sec on my computer.

    Chris

    On Thursday, October 18, 2012 6:13:04 PM UTC-7, Dave Cheney wrote:

    @Chris, what program are you using to simulate the client ? Are you
    using siege like Justin ?
    On Fri, Oct 19, 2012 at 8:30 AM, Dave Cheney wrote:
    Thanks Chris, sorry I didn't see your other reply till now. One of the
    causes of the high % of CPU spent in futex, I believe, is mutex
    contention.
    I'll keep investigating.


    On 19/10/2012, at 8:23, ChrisLu wrote:

    Please see my reply(2 hours ago) for the load test and profiling with
    your
    patch.

    Chris
    On Thursday, October 18, 2012 1:46:47 PM UTC-7, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB variant
    should reduce scheduler overhead by not informing it the goroutine is
    about
    to block. However, if write(2) does more than copy the buffer into
    kernel
    space and return the number of bytes that fit then this approach
    probably
    isn't going to improve throughput. The best way to do this is to load
    test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and
    let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    --
    --
    --
  • ChrisLu at Oct 19, 2012 at 6:33 am
    ok. To keep the persistent connections, I can use "ab -n 10000 -c 3 -k
    http://localhost:8080/".

    The performance does improve. However, Nginx improves much more.
    For Go helloworld vs Nginx, I got 5200req/sec vs 11600req/sec.

    Here is the new profile with the persistent connections:

    http://postimage.org/image/c5qq14p6b/full/

    (pprof) top
    Total: 6838 samples
    1895 27.7% 27.7% 1895 27.7% runtime.futex
    1847 27.0% 54.7% 1847 27.0% syscall.RawSyscall
    648 9.5% 64.2% 653 9.5% syscall.Syscall6
    176 2.6% 66.8% 194 2.8% syscall.Syscall
    118 1.7% 68.5% 118 1.7% syscall.RawSyscall6
    114 1.7% 70.2% 114 1.7% runtime.usleep
    98 1.4% 71.6% 98 1.4% runtime.osyield
    67 1.0% 72.6% 67 1.0% runtime.memmove
    61 0.9% 73.5% 306 4.5% runtime.mallocgc
    58 0.8% 74.3% 58 0.8% runtime.memhash
    (pprof) top --cum
    Total: 6838 samples
    2 0.0% 0.0% 5867 85.8% schedunlock
    15 0.2% 0.2% 3663 53.6% net/http.(*conn).serve
    4 0.1% 0.3% 2195 32.1% net.(*pollServer).Run
    8 0.1% 0.4% 2017 29.5% net/http.(*response).finishRequest
    4 0.1% 0.5% 1921 28.1% bufio.(*Writer).Flush
    9 0.1% 0.6% 1918 28.0% net.(*conn).Write
    7 0.1% 0.7% 1914 28.0% net.(*netFD).Write
    1895 27.7% 28.4% 1895 27.7% runtime.futex
    2 0.0% 28.5% 1849 27.0% syscall.WriteNB
    2 0.0% 28.5% 1849 27.0% syscall.writeNB

    Chris
    On Thursday, October 18, 2012 8:23:42 PM UTC-7, Dave Cheney wrote:

    That explains why you are spending so much time in syscall.Close, ab
    uses http/1.0 mode without persistent connections.

    On Fri, Oct 19, 2012 at 2:20 PM, ChrisLu <chri...@gmail.com <javascript:>>
    wrote:
    I simply use "ab -n 10000 -c 3 http://localhost:8080/".

    This is run comparing a Go helloword with Nginx. I got results like
    3600req/sec vis 5500req/sec on my computer.

    Chris

    On Thursday, October 18, 2012 6:13:04 PM UTC-7, Dave Cheney wrote:

    @Chris, what program are you using to simulate the client ? Are you
    using siege like Justin ?
    On Fri, Oct 19, 2012 at 8:30 AM, Dave Cheney wrote:
    Thanks Chris, sorry I didn't see your other reply till now. One of
    the
    causes of the high % of CPU spent in futex, I believe, is mutex
    contention.
    I'll keep investigating.


    On 19/10/2012, at 8:23, ChrisLu wrote:

    Please see my reply(2 hours ago) for the load test and profiling with
    your
    patch.

    Chris
    On Thursday, October 18, 2012 1:46:47 PM UTC-7, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB
    variant
    should reduce scheduler overhead by not informing it the goroutine
    is
    about
    to block. However, if write(2) does more than copy the buffer into
    kernel
    space and return the number of bytes that fit then this approach
    probably
    isn't going to improve throughput. The best way to do this is to
    load
    test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and
    let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses
    issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    --
    --
    --
  • Devon H. O'Dell at Oct 19, 2012 at 7:57 am

    2012/10/19 ChrisLu <chris.lu@gmail.com>:
    ok. To keep the persistent connections, I can use "ab -n 10000 -c 3 -k
    http://localhost:8080/".

    The performance does improve. However, Nginx improves much more.
    For Go helloworld vs Nginx, I got 5200req/sec vs 11600req/sec.

    Here is the new profile with the persistent connections:
    Looks like a bunch of lock contention to me. I really wonder if we
    could get better performance with user space locks in some cases.

    --dho
    http://postimage.org/image/c5qq14p6b/full/

    (pprof) top
    Total: 6838 samples
    1895 27.7% 27.7% 1895 27.7% runtime.futex
    1847 27.0% 54.7% 1847 27.0% syscall.RawSyscall
    648 9.5% 64.2% 653 9.5% syscall.Syscall6
    176 2.6% 66.8% 194 2.8% syscall.Syscall
    118 1.7% 68.5% 118 1.7% syscall.RawSyscall6
    114 1.7% 70.2% 114 1.7% runtime.usleep
    98 1.4% 71.6% 98 1.4% runtime.osyield
    67 1.0% 72.6% 67 1.0% runtime.memmove
    61 0.9% 73.5% 306 4.5% runtime.mallocgc
    58 0.8% 74.3% 58 0.8% runtime.memhash
    (pprof) top --cum
    Total: 6838 samples
    2 0.0% 0.0% 5867 85.8% schedunlock
    15 0.2% 0.2% 3663 53.6% net/http.(*conn).serve
    4 0.1% 0.3% 2195 32.1% net.(*pollServer).Run
    8 0.1% 0.4% 2017 29.5% net/http.(*response).finishRequest
    4 0.1% 0.5% 1921 28.1% bufio.(*Writer).Flush
    9 0.1% 0.6% 1918 28.0% net.(*conn).Write
    7 0.1% 0.7% 1914 28.0% net.(*netFD).Write
    1895 27.7% 28.4% 1895 27.7% runtime.futex
    2 0.0% 28.5% 1849 27.0% syscall.WriteNB
    2 0.0% 28.5% 1849 27.0% syscall.writeNB

    Chris

    On Thursday, October 18, 2012 8:23:42 PM UTC-7, Dave Cheney wrote:

    That explains why you are spending so much time in syscall.Close, ab
    uses http/1.0 mode without persistent connections.
    On Fri, Oct 19, 2012 at 2:20 PM, ChrisLu wrote:
    I simply use "ab -n 10000 -c 3 http://localhost:8080/".

    This is run comparing a Go helloword with Nginx. I got results like
    3600req/sec vis 5500req/sec on my computer.

    Chris

    On Thursday, October 18, 2012 6:13:04 PM UTC-7, Dave Cheney wrote:

    @Chris, what program are you using to simulate the client ? Are you
    using siege like Justin ?
    On Fri, Oct 19, 2012 at 8:30 AM, Dave Cheney wrote:
    Thanks Chris, sorry I didn't see your other reply till now. One of
    the
    causes of the high % of CPU spent in futex, I believe, is mutex
    contention.
    I'll keep investigating.


    On 19/10/2012, at 8:23, ChrisLu wrote:

    Please see my reply(2 hours ago) for the load test and profiling with
    your
    patch.

    Chris
    On Thursday, October 18, 2012 1:46:47 PM UTC-7, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB
    variant
    should reduce scheduler overhead by not informing it the goroutine
    is
    about
    to block. However, if write(2) does more than copy the buffer into
    kernel
    space and return the number of bytes that fit then this approach
    probably
    isn't going to improve throughput. The best way to do this is to
    load
    test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and
    let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses
    issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    --
    --
    --
    --
  • Bryanturley at Oct 24, 2012 at 6:53 pm
    (pkg/runtime/sys_linux_amd64.s)
    // int64 futex(int32 *uaddr, int32 op, int32 val,
    // struct timespec *timeout, int32 *uaddr2, int32 val2);

    TEXT runtime·futex(SB),7,$0
    MOVQ 8(SP), DI
    MOVL 16(SP), SI
    MOVL 20(SP), DX
    MOVQ 24(SP), R10
    MOVQ 32(SP), R8
    MOVL 40(SP), R9
    MOVL $202, AX
    SYSCALL
    RET

    A syscall for a mutex does seem overboard, but I am betting it is more
    portable.

    Though it seems to have at some point in time increased the speed of apps
    http://kernel.org/doc/ols/2002/ols2002-pages-479-495.pdf

    You guys talked about trying your own lock code, did that work out for
    you? I am curious.

    --
  • Ian Lance Taylor at Oct 24, 2012 at 8:07 pm

    On Wed, Oct 24, 2012 at 11:53 AM, bryanturley wrote:
    (pkg/runtime/sys_linux_amd64.s)
    // int64 futex(int32 *uaddr, int32 op, int32 val,
    // struct timespec *timeout, int32 *uaddr2, int32 val2);
    TEXT runtime·futex(SB),7,$0
    MOVQ 8(SP), DI
    MOVL 16(SP), SI
    MOVL 20(SP), DX
    MOVQ 24(SP), R10
    MOVQ 32(SP), R8
    MOVL 40(SP), R9
    MOVL $202, AX
    SYSCALL
    RET

    A syscall for a mutex does seem overboard, but I am betting it is more
    portable.

    Though it seems to have at some point in time increased the speed of apps
    http://kernel.org/doc/ols/2002/ols2002-pages-479-495.pdf

    You guys talked about trying your own lock code, did that work out for you?
    I am curious.
    I'm not sure I understand your question, but that is not the lock
    code. That is the code for the futex system call. The lock code for
    a GNU/Linux system is in runtime/lock_futex.c. The lock code uses the
    futex system call to wait for a lock to become available, but, e.g.,
    acquiring an unlocked lock does not make a system call.

    Ian

    --
  • Bryanturley at Oct 24, 2012 at 8:32 pm

    I am curious.
    I'm not sure I understand your question, but that is not the lock
    code. That is the code for the futex system call.

    Yeah, I should have read more of the code. I did start in lock_futex.c, but
    read through it too fast. Mostly I noted the massive style difference between
    Go's C code and Go's normal coding style...
    I code in c almost identically to the default go coding style.
    I was responding to this thread they have a pprof generated image that says
    runtime.futex() is taking 27% a few emails up.

    The lock code for
    a GNU/Linux system is in runtime/lock_futex.c. The lock code uses the
    futex system call to wait for a lock to become available, but, e.g.,
    acquiring an unlocked lock does not make a system call.

    Ian
    Oh that is good to know, never used the futex call.
    Every time (before go) I have needed a mutex I have written my own, but
    they never worked with the kernel scheduler like futex does. Then again
    futex doesn't work on all (any?) non-linux platforms. Should have read the
    lock code more carefully. I wonder if spinning a little longer before
    calling the futex stuff would speed this up.
    All in all I was curious whether they got the go http server faster in this
    case.

    --
  • Bryanturley at Oct 24, 2012 at 8:47 pm
    Ian just to be clear "You guys talked about trying your own lock code, did
    that work out for you? I am curious."
    Was referring to the people on this thread not the go developers as a
    whole. I trust you guys ;)



    --
  • ChrisLu at Oct 24, 2012 at 9:04 pm
    As the original poster, I would say this performance problem still exists.

    Just by looking at the graph, the runtime.futex costs 27.7% time. They seem
    to be all cost for scheduling the goroutines.
    If so, this seems a very high cost just to have the convenience of
    goroutines.

    http://postimage.org/image/c5qq14p6b/full/

    The Go scheduler grows goroutines on an as-needed basis, one by one, and
    never releases the idle goroutines.
    Here seems a good opportunity to pool the goroutines more efficiently. Am I
    right?

    Chris
    http://weed-fs.googlecode.com
    On Wednesday, October 24, 2012 1:32:08 PM UTC-7, bryanturley wrote:

    I am curious.

    I'm not sure I understand your question, but that is not the lock
    code. That is the code for the futex system call.

    Yeah I should have read more of the code I did start in lock_futex.c, read
    through it to fast. Mostly noted the massive style difference in go's c
    code to go's normal coding style...
    I code in c almost identically to the default go coding style.
    I was responding to this thread they have a pprof generated image that
    says runtime.futex() is taking 27% a few emails up.

    The lock code for
    a GNU/Linux system is in runtime/lock_futex.c. The lock code uses the
    futex system call to wait for a lock to become available, but, e.g.,
    acquiring an unlocked lock does not make a system call.

    Ian
    Oh that is good to know, never used the futex call.
    Every time (before go) I have needed a mutex I have written my own, but
    they never worked with the kernel scheduler like futex does. Then again
    futex doesn't work on all (any?) non-linux platforms. Should have read the
    lock code more carefully. I wonder if spinning a little longer before
    calling the futex stuff would speed this up.
    All in all I was curious whether they got the go http server faster in
    this case.
    --
  • Bryanturley at Oct 24, 2012 at 9:40 pm

    On Wednesday, October 24, 2012 4:04:32 PM UTC-5, ChrisLu wrote:
    As the original poster, I would say this performance problem still exists.

    Just by looking at the graph, the runtime.futex costs 27.7% time. They
    seem to be all cost for scheduling the goroutines.
    If so, this seems a very high cost just to have the convenience of
    goroutines.

    http://postimage.org/image/c5qq14p6b/full/

    The Go scheduler grows goroutines on an as needed bases one by one, and
    never releases back the idle goroutines.
    You only get new goroutines by using the go keyword. So you or a library
    you are using (http) grow them on an as needed basis the scheduler just
    schedules them. If your goroutine exits it doesn't come back to life at
    any point.
    Though it might be interesting to know how many goroutines are
    alive/created/die during this test.

    I don't think it is the scheduler's job to release idle goroutines either.
    If one is idle and it is no longer needed it should be killed by its
    creator, not by a 2nd/3rd party observer.

    Here seems a good opportunity to pool the goroutines more efficiently. Am I
    right?
    I have read (I think) that there is a new scheduler coming soonish that
    might fix this. Though pool is not the word I would use... manage perhaps.
    If you want to get crazy you could try changing the spin counts yourself
    that might make it better, could also make it way way worse.
    I wouldn't recommend it unless you have some experience at that level
    already though.

    --
  • Bryanturley at Oct 24, 2012 at 9:44 pm

    On Wednesday, October 24, 2012 4:40:34 PM UTC-5, bryanturley wrote:

    On Wednesday, October 24, 2012 4:04:32 PM UTC-5, ChrisLu wrote:

    As the original poster, I would say this performance problem still exists.
    It is a mostly synthetic benchmark as well though. If your code did a
    little more work before returning data I bet it would even the numbers out.
    I also wouldn't call it a problem, it is just that nginx is faster for the
    moment.


    --
  • ChrisLu at Oct 25, 2012 at 7:07 am
    Not really just synthetic. Golang is good for system programming, and many
    use cases involve providing web services. For my case, the Weed File System
    project, it is meant to serve static content via http. Even though the
    algorithm is fast and efficient, it is embarrassing that static file serving
    is so much slower.

    Nginx is just one web server. There are many other servers much faster too.

    If this is a known slow performance problem, probably we should get warned
    during the Golang "close to the metal" marketing.

    The Go scheduler clearly can be much more efficient. Don't call the
    benchmark synthetic, and let's look at the real issue.

    Chris
    http://weed-fs.googlecode.com
    On Wednesday, October 24, 2012 2:44:28 PM UTC-7, bryanturley wrote:
    On Wednesday, October 24, 2012 4:40:34 PM UTC-5, bryanturley wrote:


    On Wednesday, October 24, 2012 4:04:32 PM UTC-5, ChrisLu wrote:

    As the original poster, I would say this performance problem still
    exists.
    It is a mostly synthetic benchmark as well though. If your code did a
    little more work before returning data I bet it would even the numbers out.
    I also wouldn't call it a problem, it is just that nginx is faster for the
    moment.
    --
  • Dustin at Oct 25, 2012 at 7:18 am

    On Thursday, October 25, 2012 12:07:22 AM UTC-7, ChrisLu wrote:
    Not really just synthetic. Golang is good for system programming, and many
    use cases involve providing web services. For my case, the Weed File System
    project, it is meant to serve static content via http. Even the algorithm
    is fast and efficient, it is embarrassing that static file serving are so
    much slow.

    Nginx is just one web server. There are many other servers much faster too.

    If this is a known slow performance problem, probably we should get warned
    during the Golang "close to the metal" marketing.

    The Go scheduler clearly can be much more efficient. Don't call the
    benchmark synthetic, and let's look at the real issue.
    To be fair, you're saying the web server performance is ~20% slower than
    your custom C++ solution and maybe around half the speed of the fastest
    hand optimized web server you can find. While it's certainly possible that
    it can get faster, I think it's a bit unreasonable to call this
    embarrassing.

    --
  • Joubin Houshyar at Oct 25, 2012 at 2:49 pm

    On Thursday, October 25, 2012 3:18:48 AM UTC-4, Dustin wrote:
    On Thursday, October 25, 2012 12:07:22 AM UTC-7, ChrisLu wrote:

    Not really just synthetic. Golang is good for system programming, and
    many use cases involve providing web services. For my case, the Weed File
    System project, it is meant to serve static content via http. Even the
    algorithm is fast and efficient, it is embarrassing that static file
    serving are so much slow.

    Nginx is just one web server. There are many other servers much faster
    too.

    If this is a known slow performance problem, probably we should get
    warned during the Golang "close to the metal" marketing.

    The Go scheduler clearly can be much more efficient. Don't call the
    benchmark synthetic, and let's look at the real issue.
    To be fair, you're saying the web server performance is ~20% slower than
    your custom C++ solution and maybe around half the speed of the fastest
    hand optimized web server you can find. While it's certainly possible that
    it can get faster, I think it's a bit unreasonable to call this
    embarrassing.
    That is fair, Dustin. But the subtext here is whether there even exists a
    path for a hand-optimized/non-idiomatic WFS. I suggest that it would be
    generally helpful for the community (and also a plus for Go the language)
    to address this type of concern and provide guidance for improving
    performance, and not dismiss them out of hand.

    /R

    --
  • Ethan Burns at Oct 25, 2012 at 3:15 pm

    On Thursday, October 25, 2012 10:49:07 AM UTC-4, Joubin Houshyar wrote:

    That is fair, Dustin. But the subtext here is whether there even exists a
    path for a hand-optimized/non-idiomatic WFS. I suggest that it would be
    generally helpful for the community (and also a plus for Go the language)
    to address this type of concern and provide guidance for improving
    performance, and not dismiss them out of hand.

    I don't think that anything has been dismissed. From what I understand,
    this has been classified as issue 2933
    (http://code.google.com/p/go/issues/detail?id=2933) which is marked as
    Go1.1 maybe (on the TODO list, but not at the top of it). There are many
    other things that need to be fixed before Go 1.1 is ready
    (http://swtch.com/~rsc/go11.html), so I am sure that everyone is quite busy.


    Best,
    Ethan

    --
  • Joubin Houshyar at Oct 25, 2012 at 4:08 pm

    On Thursday, October 25, 2012 11:15:24 AM UTC-4, Ethan Burns wrote:
    On Thursday, October 25, 2012 10:49:07 AM UTC-4, Joubin Houshyar wrote:

    That is fair, Dustin. But the subtext here is whether there even exists a
    path for a hand-optimized/non-idiomatic WFS. I suggest that it would be
    generally helpful for the community (and also a plus for Go the language)
    to address this type of concern and provide guidance for improving
    performance, and not dismiss them out of hand.

    I don't think that anything has been dismissed. From what I understand,
    this has been classified as issue 2933 (
    http://code.google.com/p/go/issues/detail?id=2933) which is marked as
    Go1.1 maybe (on the TODO list, but not at the top of it). There are many
    other things that need to be fixed before Go 1.1 is ready (
    http://swtch.com/~rsc/go11.html), so I am sure that everyone is quite
    busy.
    Just in case it is not clear: Comment was a general suggestion and
    certainly not directed at Dustin, and, imo it is /perfectly reasonable/
    that version 1.0 of Go runtime (or any platform for that matter) is not
    matching the performance of stacks that have been under development for
    years. Remember this?
    http://openmap.bbn.com/~kanderso/performance/java/index.html

    Best,
    Ethan
    /R

    --
  • Dave Cheney at Oct 28, 2012 at 5:09 pm
    http://codereview.appspot.com/6813046/

    Could those with suitable test harnesses please comment with benchmark
    numbers if this change produces an improvement.
    On Fri, Oct 26, 2012 at 3:08 AM, Joubin Houshyar wrote:

    On Thursday, October 25, 2012 11:15:24 AM UTC-4, Ethan Burns wrote:
    On Thursday, October 25, 2012 10:49:07 AM UTC-4, Joubin Houshyar wrote:

    That is fair, Dustin. But the subtext here is whether there even exists a
    path for a hand-optimized/non-idiomatic WFS. I suggest that it would be
    generally helpful for the community (and also a plus for Go the language) to
    address this type of concern and provide guidance for improving performance,
    and not dismiss them out of hand.

    I don't think that anything has been dismissed. From what I understand,
    this has been classified as issue 2933
    (http://code.google.com/p/go/issues/detail?id=2933) which is marked as Go1.1
    maybe (on the TODO list, but not at the top of it). There are many other
    things that need to be fixed before Go 1.1 is ready
    (http://swtch.com/~rsc/go11.html), so I am sure that everyone is quite busy.

    Just in case it is not clear: Comment was a general suggestion and
    certainly not directed at Dustin, and, imo it is /perfectly reasonable/ that
    version 1.0 of Go runtime (or any platform for that matter) is not matching
    the performance of stacks that have been under development for years.
    Remember this? http://openmap.bbn.com/~kanderso/performance/java/index.html

    Best,
    Ethan

    /R

    --
    --
  • Rob Pike at Oct 28, 2012 at 5:13 pm
    I don't like the change in semantics here. A blocking operation has
    silently become non-blocking.

    -rob

    --
  • Job van der Zwan at Oct 28, 2012 at 6:02 pm
    Uhm... the function is called "WriteNB", isn't that pretty explicit?
    On Sunday, 28 October 2012 18:13:27 UTC+1, Rob Pike wrote:

    I don't like the change in semantics here. A blocking operation has
    silently become non-blocking.

    -rob
    --
  • Joubin Houshyar at Oct 29, 2012 at 1:13 am

    On Sun, Oct 28, 2012 at 1:55 PM, Job van der Zwan wrote:

    Uhm... the function is called "WriteNB", isn't that pretty explicit?

    He means http://golang.org/pkg/io/#Writer


    On Sunday, 28 October 2012 18:13:27 UTC+1, Rob Pike wrote:

    I don't like the change in semantics here. A blocking operation has
    silently become non-blocking.

    -rob
    --
  • David Anderson at Oct 29, 2012 at 1:11 am
    Check the comments on the code review. That concern is resolved, so
    assuming other things also get resolved, we should get a decent network
    performance boost. Yay.

    - Dave
    On Sun, Oct 28, 2012 at 6:06 PM, Joubin Houshyar wrote:



    On Sun, Oct 28, 2012 at 1:55 PM, Job van der Zwan <
    j.l.vanderzwan@gmail.com> wrote:
    Uhm... the function is called "WriteNB", isn't that pretty explicit?

    He means http://golang.org/pkg/io/#Writer


    On Sunday, 28 October 2012 18:13:27 UTC+1, Rob Pike wrote:

    I don't like the change in semantics here. A blocking operation has
    silently become non-blocking.

    -rob
    --

    --
  • Dave Cheney at Nov 21, 2012 at 5:33 am
    Hello,

    Some new results are available

    https://codereview.appspot.com/6813046/#msg24

    I am interested to see if others can verify or contradict my results.

    Cheers

    Dave
    On Mon, Oct 29, 2012 at 12:11 PM, David Anderson wrote:
    Check the comments on the code review. That concern is resolved, so assuming
    other things also get resolved, we should get a decent network performance
    boost. Yay.

    - Dave
    On Sun, Oct 28, 2012 at 6:06 PM, Joubin Houshyar wrote:



    On Sun, Oct 28, 2012 at 1:55 PM, Job van der Zwan
    wrote:
    Uhm... the function is called "WriteNB", isn't that pretty explicit?

    He means http://golang.org/pkg/io/#Writer


    On Sunday, 28 October 2012 18:13:27 UTC+1, Rob Pike wrote:

    I don't like the change in semantics here. A blocking operation has
    silently become non-blocking.

    -rob

    --
    --
  • ChrisLu at Oct 24, 2012 at 9:41 pm
    Could be a similar issue to
    http://code.google.com/p/go/issues/detail?id=2933

    However it is marked as "Go 1.1 maybe". Hope it will not take forever to
    fix.

    Chris
    On Wednesday, October 24, 2012 2:04:32 PM UTC-7, ChrisLu wrote:

    As the original poster, I would say this performance problem still exists.

    Just by looking at the graph, the runtime.futex costs 27.7% time. They
    seem to be all cost for scheduling the goroutines.
    If so, this seems a very high cost just to have the convenience of
    goroutines.

    http://postimage.org/image/c5qq14p6b/full/

    The Go scheduler grows goroutines on an as needed bases one by one, and
    never releases back the idle goroutines.
    Here seems a good opportunity to pool the goroutines more efficiently. Am
    I right?

    Chris
    http://weed-fs.googlecode.com
    On Wednesday, October 24, 2012 1:32:08 PM UTC-7, bryanturley wrote:

    I am curious.

    I'm not sure I understand your question, but that is not the lock
    code. That is the code for the futex system call.

    Yeah I should have read more of the code I did start in lock_futex.c,
    read through it to fast. Mostly noted the massive style difference in go's
    c code to go's normal coding style...
    I code in c almost identically to the default go coding style.
    I was responding to this thread they have a pprof generated image that
    says runtime.futex() is taking 27% a few emails up.

    The lock code for
    a GNU/Linux system is in runtime/lock_futex.c. The lock code uses the
    futex system call to wait for a lock to become available, but, e.g.,
    acquiring an unlocked lock does not make a system call.

    Ian
    Oh that is good to know, never used the futex call.
    Every time (before go) I have needed a mutex I have written my own, but
    they never worked with the kernel scheduler like futex does. Then again
    futex doesn't work on all (any?) non-linux platforms. Should have read the
    lock code more carefully. I wonder if spinning a little longer before
    calling the futex stuff would speed this up.
    All in all I was curious whether they got the go http server faster in
    this case.
    --
  • Jli Justinli at Oct 19, 2012 at 2:29 am
    Just benchmarked the new code (also switched to the current version in the
    repository rather than the release version). Here's the pprof
    output: http://dl.dropbox.com/u/11537896/pprof7401.0.svg It looks like the
    time in syscall is still quite high.
    The transaction rate is up somewhat though, I'm hitting around 4400
    trans/sec consistently.
    On Thursday, October 18, 2012 4:46:47 PM UTC-4, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB variant
    should reduce scheduler overhead by not informing it the goroutine is about
    to block. However, if write(2) does more than copy the buffer into kernel
    space and return the number of bytes that fit then this approach probably
    isn't going to improve throughput. The best way to do this is to load test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com <javascript:> wrote:

    Awesome, this looks really good! Once I get home I'll try it out and let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    On Tue, Oct 16, 2012 at 3:44 PM, Dave Cheney wrote:
    I think the best solution is to resolve
    https://code.google.com/p/go/issues/detail?id=3412, this will reduce
    the amount of scheduler thrashing.
    On Tue, Oct 16, 2012 at 2:11 PM, Chris Lu wrote:
    Actually I tried several approaches: setting/unsetting GOMAXPROCS and
    setting the content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris


    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney <da...@cheney.net>
    wrote:
    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the
    original
    post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly
    justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm
    using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --


    --
  • Dave Cheney at Oct 19, 2012 at 3:27 am
    Can you please try again, I've removed a few allocations in the Accept() path.

    hg revert @6739043
    hg clpatch 6739043

    then ./make.bash

    will be sufficient.

    The blocking on syscall.Accept and syscall.Close are probably
    unavoidable, they are blocking syscalls, so we have to inform the
    scheduler so it can park the goroutine. syscall.Accept may be fixable,
    I'm pretty sure Close is not.
    On Fri, Oct 19, 2012 at 1:25 PM, wrote:
    Just benchmarked the new code (also switched to the current version in the
    repository rather than the release version). Here's the pprof output:
    http://dl.dropbox.com/u/11537896/pprof7401.0.svg It looks like the time in
    syscall is still quite high.
    The transaction rate is up somewhat though, I'm hitting around 4400
    trans/sec consistently.
    On Thursday, October 18, 2012 4:46:47 PM UTC-4, Dave Cheney wrote:

    Well, this idea still needs validation. In theory using the NB variant
    should reduce scheduler overhead by not informing it the goroutine is about
    to block. However, if write(2) does more than copy the buffer into kernel
    space and return the number of bytes that fit then this approach probably
    isn't going to improve throughput. The best way to do this is to load test
    and profile.

    On 19/10/2012, at 3:27, jli.ju...@gmail.com wrote:

    Awesome, this looks really good! Once I get home I'll try it out and let
    you know how it goes.
    Thanks for taking this on!

    - Justin
    On Thursday, October 18, 2012 6:53:24 AM UTC-4, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    On Tue, Oct 16, 2012 at 3:44 PM, Dave Cheney wrote:
    I think the best solution is to resolve
    https://code.google.com/p/go/issues/detail?id=3412, this will reduce
    the amount of scheduler thrashing.
    On Tue, Oct 16, 2012 at 2:11 PM, Chris Lu wrote:
    Actually I tried several approaches, but setting/unsetting GOMAXPROCS,
    content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris

    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney wrote:

    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the
    original
    post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly
    justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm
    using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
    --
    --
  • ChrisLu at Oct 18, 2012 at 8:31 pm
    Here is the profiling for GOMAXPROCS=1 and GOMAXPROCS=#ofCPUs,
    respectively, with the profiling graph.

    The previous heavy usage of "syscall.Syscall" seems much less now and I
    consider it a very good change. However, the overall performance still stays
    almost the same. The bottleneck now seems to be in "runtime.futex".

    1) GOMAXPROCS=1

    http://postimage.org/image/rvl8yckgh/

    (pprof) top
    Total: 3216 samples
    1406 43.7% 43.7% 1406 43.7% runtime.futex
    1216 37.8% 81.5% 1216 37.8% syscall.RawSyscall
    324 10.1% 91.6% 327 10.2% syscall.Syscall
    152 4.7% 96.3% 152 4.7% bytes.IndexByte
    30 0.9% 97.3% 30 0.9% scanblock
    6 0.2% 97.5% 7 0.2% sweepspan
    5 0.2% 97.6% 6 0.2% syscall.Syscall6
    3 0.1% 97.7% 4 0.1% MCentral_Alloc
    3 0.1% 97.8% 7 0.2% net/textproto.(*Reader).ReadMIMEHeader
    3 0.1% 97.9% 9 0.3% runtime.MCache_Alloc
    (pprof) top --cum
    Total: 3216 samples
    2 0.1% 0.1% 3197 99.4% schedunlock
    1 0.0% 0.1% 2331 72.5% net/http.(*conn).serve
    1406 43.7% 43.8% 1406 43.7% runtime.futex
    1 0.0% 43.8% 1401 43.6% runtime.entersyscall
    0 0.0% 43.8% 1400 43.5% type..eq.[32]string
    0 0.0% 43.8% 1398 43.5% runtime.futexwakeup
    0 0.0% 43.8% 1398 43.5% runtime.notewakeup
    0 0.0% 43.8% 1217 37.8% bufio.(*Writer).Flush
    0 0.0% 43.8% 1217 37.8% net.(*conn).Write
    0 0.0% 43.8% 1217 37.8% net.(*netFD).Write

    2) GOMAXPROCS = number of CPUs
    http://postimage.org/image/erx51udrb/full/

    (pprof) top
    Total: 4550 samples
    1351 29.7% 29.7% 1351 29.7% runtime.futex
    1181 26.0% 55.6% 1181 26.0% syscall.RawSyscall
    588 12.9% 68.6% 588 12.9% runtime.usleep
    439 9.6% 78.2% 443 9.7% syscall.Syscall
    320 7.0% 85.3% 320 7.0% syscall.Syscall6
    39 0.9% 86.1% 84 1.8% sweepspan
    37 0.8% 86.9% 37 0.8% syscall.RawSyscall6
    30 0.7% 87.6% 30 0.7% bytes.IndexByte
    27 0.6% 88.2% 179 3.9% scanblock
    25 0.5% 88.7% 25 0.5% runtime.memmove
    (pprof) top --cum
    Total: 4550 samples
    0 0.0% 0.0% 3518 77.3% schedunlock
    5 0.1% 0.1% 1910 42.0% net/http.(*conn).serve
    1351 29.7% 29.8% 1351 29.7% runtime.futex
    0 0.0% 29.8% 1188 26.1% net/http.(*response).finishRequest
    0 0.0% 29.8% 1187 26.1% bufio.(*Writer).Flush
    3 0.1% 29.9% 1187 26.1% net.(*conn).Write
    1 0.0% 29.9% 1184 26.0% net.(*netFD).Write
    1181 26.0% 55.8% 1181 26.0% syscall.RawSyscall
    2 0.0% 55.9% 1180 25.9% syscall.WriteNB
    0 0.0% 55.9% 1178 25.9% syscall.writeNB

    On Thursday, October 18, 2012 3:53:24 AM UTC-7, Dave Cheney wrote:

    Chris, Justin,

    If you are able, could you try this CL which partially addresses issue
    3412.

    http://codereview.appspot.com/6739043

    Benchmark results and profile svgs would be great as I don't have a
    test harness that can generate enough load.

    Cheers

    Dave
    On Tue, Oct 16, 2012 at 3:44 PM, Dave Cheney wrote:
    I think the best solution is to resolve
    https://code.google.com/p/go/issues/detail?id=3412, this will reduce
    the amount of scheduler thrashing.
    On Tue, Oct 16, 2012 at 2:11 PM, Chris Lu wrote:
    Actually I tried several approaches, but setting/unsetting GOMAXPROCS,
    content-length. But all got similar results.

    This graph is without GOMAXPROCS setting.

    http://postimage.org/image/aurx4vvmn/full/

    With the code, you should be able to get similar results easily.

    Chris

    On Mon, Oct 15, 2012 at 7:30 PM, Dave Cheney wrote:

    Do you have a graph with GOMAXPROCS unset ?

    Related: resolving https://code.google.com/p/go/issues/detail?id=3412
    may reduce the amount of time spent in Write by avoiding a scheduler
    call.
    On Tue, Oct 16, 2012 at 1:20 PM, ChrisLu wrote:
    The code should be just basic "Hello World", as seen in the original
    post.

    I also profiled the execution graph here:

    http://postimage.org/image/aurx4vvmn/full/

    Chris
    On Monday, October 15, 2012 3:52:46 PM UTC-7, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify
    it
    when
    it's slower than the previous C++ thread-per-connection one I'm
    using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
  • Jli Justinli at Oct 16, 2012 at 3:02 am
    I accidentally posted my previous message without finishing, I meant to say
    "Any more information about this would be much appreciated."

    Anyways, here's the code I'm using. I have a custom handler because this is
    stripped out of a bigger application.

    ////////////////////////////////////
    package main

    import (
    "runtime"
    "runtime/pprof"
    "os"
    "os/signal"
    "net/http"
    "strconv"
    "io"
    "fmt"
    )

    type httpHandler struct{}
    func (handler *httpHandler) ServeHTTP(w http.ResponseWriter, r
    *http.Request) {
    response := "Test content."
    w.Header().Add("Content-Type", "text/plain")
    w.Header().Add("Content-Length", strconv.Itoa(len(response)))
    io.WriteString(w, response)
    }

    func main() {
    runtime.GOMAXPROCS(1)//runtime.NumCPU())

    profile := true

    if profile {
    f, _ := os.Create("profile.cpu")
    pprof.StartCPUProfile(f)
    }

    go func() {
    c := make(chan os.Signal, 1)
    signal.Notify(c, os.Interrupt)
    <-c

    if profile {
    pprof.StopCPUProfile()
    }

    fmt.Println("Caught interrupt.. shutting down.")
    os.Exit(0)
    }()

    handler := &httpHandler{}
    server := &http.Server{
    Addr: ":8080",
    Handler: handler,
    }
    server.ListenAndServe()
    }
    ////////////////////////////////////

    And here's the pprof
    output: https://dl.dropbox.com/u/11537896/pprof11533.0.svg

    Any help optimizing this would be awesome!

    On Monday, October 15, 2012 6:52:46 PM UTC-4, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it when
    it's slower than the previous C++ thread-per-connection one I'm using. Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
  • Dave Cheney at Oct 16, 2012 at 3:04 am
    Thank you for posting your code. Please describe your benchmark harness.
    On Tue, Oct 16, 2012 at 2:02 PM, wrote:
    I accidentally posted my previous message without finishing, I meant to say
    "Any more information about this would be much appreciated."

    Anyways, here's the code I'm using. I have a custom handler because this is
    stripped out of a bigger application.

    ////////////////////////////////////
    package main

    import (
    "runtime"
    "runtime/pprof"
    "os"
    "os/signal"
    "net/http"
    "strconv"
    "io"
    "fmt"
    )

    type httpHandler struct{}
    func (handler *httpHandler) ServeHTTP(w http.ResponseWriter, r
    *http.Request) {
    response := "Test content."
    w.Header().Add("Content-Type", "text/plain")
    w.Header().Add("Content-Length", strconv.Itoa(len(response)))
    io.WriteString(w, response)
    }

    func main() {
    runtime.GOMAXPROCS(1)//runtime.NumCPU())

    profile := true

    if profile {
    f, _ := os.Create("profile.cpu")
    pprof.StartCPUProfile(f)
    }

    go func() {
    c := make(chan os.Signal, 1)
    signal.Notify(c, os.Interrupt)
    <-c

    if profile {
    pprof.StopCPUProfile()
    }

    fmt.Println("Caught interrupt.. shutting down.")
    os.Exit(0)
    }()

    handler := &httpHandler{}
    server := &http.Server{
    Addr: ":8080",
    Handler: handler,
    }
    server.ListenAndServe()
    }
    ////////////////////////////////////

    And here's the pprof output:
    https://dl.dropbox.com/u/11537896/pprof11533.0.svg

    Any help optimizing this would be awesome!

    On Monday, October 15, 2012 6:52:46 PM UTC-4, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it
    when
    it's slower than the previous C++ thread-per-connection one I'm using.
    Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --
    --
  • Jli Justinli at Oct 16, 2012 at 3:34 am
    Also, forgot to mention that siege -c1000 -t10s hovers at ~4000 req/sec for
    this (~4300 with GOMAXPROCS = 8 on this 8 core system), whereas the C++
    server I'm comparing it to gets ~5500 req/sec.
    On Monday, October 15, 2012 11:02:25 PM UTC-4, jli.ju...@gmail.com wrote:

    I accidentally posted my previous message without finishing, I meant to
    say "Any more information about this would be much appreciated."

    Anyways, here's the code I'm using. I have a custom handler because this
    is stripped out of a bigger application.

    ////////////////////////////////////
    package main

    import (
    "runtime"
    "runtime/pprof"
    "os"
    "os/signal"
    "net/http"
    "strconv"
    "io"
    "fmt"
    )

    type httpHandler struct{}
    func (handler *httpHandler) ServeHTTP(w http.ResponseWriter, r
    *http.Request) {
    response := "Test content."
    w.Header().Add("Content-Type", "text/plain")
    w.Header().Add("Content-Length", strconv.Itoa(len(response)))
    io.WriteString(w, response)
    }

    func main() {
    runtime.GOMAXPROCS(1)//runtime.NumCPU())

    profile := true

    if profile {
    f, _ := os.Create("profile.cpu")
    pprof.StartCPUProfile(f)
    }

    go func() {
    c := make(chan os.Signal, 1)
    signal.Notify(c, os.Interrupt)
    <-c

    if profile {
    pprof.StopCPUProfile()
    }

    fmt.Println("Caught interrupt.. shutting down.")
    os.Exit(0)
    }()

    handler := &httpHandler{}
    server := &http.Server{
    Addr: ":8080",
    Handler: handler,
    }
    server.ListenAndServe()
    }
    ////////////////////////////////////

    And here's the pprof output:
    https://dl.dropbox.com/u/11537896/pprof11533.0.svg

    Any help optimizing this would be awesome!

    On Monday, October 15, 2012 6:52:46 PM UTC-4, Dave Cheney wrote:

    I'm getting the same result here.. I'm trying to build a
    high-performance
    web server for a particular application, but I can hardly justify it when
    it's slower than the previous C++ thread-per-connection one I'm using. Any
    more
    That is very concerning. Please post your test code so others can
    attempt to reproduce your results.

    Dave
    --

Related Discussions

Discussion Navigation
viewthread | post
Discussion Overview
groupgolang-nuts @
categoriesgo
postedOct 13, '12 at 11:09a
activeNov 21, '12 at 5:33a
posts48
users14
websitegolang.org

People

Translate

site design / logo © 2022 Grokbase