Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

service crash due to concurrent map operations #80

Closed
yb01 opened this issue Jul 13, 2022 · 1 comment
Closed

service crash due to concurrent map operations #80

yb01 opened this issue Jul 13, 2022 · 1 comment

Comments

@yb01
Copy link
Collaborator

yb01 commented Jul 13, 2022

In the snippets below, the relevant call is highlighted in bold (wrapped in `**`).

// SetCheckpoint stamps the event with the current UTC time for the given
// checkpoint the first time that checkpoint is recorded.
//
// It does nothing when metrics.ResourceManagementMeasurement_Enabled is false.
// The checkpoints map is created lazily on first use. If the checkpoint has
// already been recorded, the stored time is left untouched and an error is
// logged instead.
//
// NOTE(review): this method reads and writes e.checkpoints without any
// locking, so calling it on the same NodeEvent from more than one goroutine
// is a data race ("concurrent map read and map write").
func (e *NodeEvent) SetCheckpoint(checkpoint metrics.ResourceManagementCheckpoint) {
	if !metrics.ResourceManagementMeasurement_Enabled {
		return
	}

	// Lazily allocate the map; 5 is the initial capacity hint.
	if e.checkpoints == nil {
		e.checkpoints = make(map[metrics.ResourceManagementCheckpoint]time.Time, 5)
	}

	// A checkpoint may only be recorded once; report duplicates and bail out.
	if _, recorded := e.checkpoints[checkpoint]; recorded {
		klog.Errorf("Checkpoint %v already set for event %s, node id %s, rv %s", checkpoint, e.Type, e.Node.Id, e.Node.ResourceVersion)
		return
	}

	e.checkpoints[checkpoint] = time.Now().UTC()
}

// Watch loop: streams each record received from watchCh to the HTTP response
// as JSON until done fires or watchCh is closed.
for {
		select {
		case <-done:
			return
		case record, ok := <-watchCh:
			if !ok {
				// End of results.
				klog.V(3).Infof("End of results")
				return
			}

			klog.V(9).Infof("Getting event from distributor: %v, %v", *record, *record.Node)

			// NOTE(review): if Encode has already written part of the body, the
			// WriteHeader call below probably cannot change the status code any
			// more — confirm against net/http ResponseWriter semantics.
			if err := json.NewEncoder(resp).Encode(*record); err != nil {
				klog.V(3).Infof("encoding record failed. error %v", err)
				resp.WriteHeader(http.StatusInternalServerError)
				return
			}
			record.SetCheckpoint(metrics.Serializer_Encoded)
			// Flush only when no further records are currently buffered in watchCh.
			if len(watchCh) == 0 {
				flusher.Flush()
			}
			// NOTE(review): record is a pointer; presumably the same event can be
			// delivered to several watchers, in which case these SetCheckpoint
			// calls touch one shared checkpoints map from multiple goroutines —
			// consistent with the reported "concurrent map read and map write"
			// crash. Verify against the distributor.
			**record.SetCheckpoint(metrics.Serializer_Sent)**
			event.AddLatencyMetricsAllCheckpoints(record)
		}
	}
I0712 17:17:25.124856   39554 eventqueue.go:188] Event with node id cfc54fdf-fa5a-4a1c-9c84-4c31147df1e8 sent
fatal error: concurrent map read and map write

goroutine 17689 [running]:
runtime.throw({0x1339b0e, 0xc00ca515a8})
	/usr/local/go/src/runtime/panic.go:1198 +0x71 fp=0xc00ca51548 sp=0xc00ca51518 pc=0x10342d1
runtime.mapaccess2_faststr(0x1075eaf, 0x15983e0, {0x1330eb6, 0xb})
	/usr/local/go/src/runtime/map_faststr.go:116 +0x3d4 fp=0xc00ca515b0 sp=0xc00ca51548 pc=0x1013b14
global-resource-service/resource-management/pkg/common-lib/types/event.(*NodeEvent).SetCheckpoint(0xc00b24a800, {0x1330eb6, 0xb})
	/Users/yunwenbai/work5/src/global-resource-service/resource-management/pkg/common-lib/types/event/event.go:42 +0xb3 fp=0xc00ca51660 sp=0xc00ca515b0 pc=0x10ec2d3
global-resource-service/resource-management/pkg/service-api/endpoints.(*Installer).serverWatch(0xc000112230, {0x139e1d0, 0xc00a0fa000}, 0xc00b0d0300, {0xc00a01200f, 0x2b})
	/Users/yunwenbai/work5/src/global-resource-service/resource-management/pkg/service-api/endpoints/installer.go:230 +0xb90 fp=0xc00ca51848 sp=0xc00ca51660 pc=0x1257190
global-resource-service/resource-management/pkg/service-api/endpoints.(*Installer).ResourceHandler(0xc000112230, {0x139e1d0, 0xc00a0fa000}, 0xc00b0d0300)
	/Users/yunwenbai/work5/src/global-resource-service/resource-management/pkg/service-api/endpoints/installer.go:149 +0x6f8 fp=0xc00ca51940 sp=0xc00ca51848 pc=0x1256578
global-resource-service/resource-management/pkg/service-api/endpoints.(*Installer).ResourceHandler-fm({0x139e1d0, 0xc00a0fa000}, 0xc00c4322d0)
	/Users/yunwenbai/work5/src/global-resource-service/resource-management/pkg/service-api/endpoints/installer.go:110 +0x3c fp=0xc00ca51970 sp=0xc00ca51940 pc=0x128f5bc
net/http.HandlerFunc.ServeHTTP(0xc00b0d0200, {0x139e1d0, 0xc00a0fa000}, 0x1598bc0)
	/usr/local/go/src/net/http/server.go:2047 +0x2f fp=0xc00ca51998 sp=0xc00ca51970 pc=0x122674f
github.com/gorilla/mux.(*Router).ServeHTTP(0xc00675e000, {0x139e1d0, 0xc00a0fa000}, 0xc00b0d0000)
	/Users/yunwenbai/go/pkg/mod/github.com/gorilla/[email protected]/mux.go:210 +0x1cf fp=0xc00ca51ac0 sp=0xc00ca51998 pc=0x128b86f
net/http.serverHandler.ServeHTTP({0x139d3f8}, {0x139e1d0, 0xc00a0fa000}, 0xc00b0d0000)
	/usr/local/go/src/net/http/server.go:2879 +0x43b fp=0xc00ca51b80 sp=0xc00ca51ac0 pc=0x1228c1b
net/http.(*conn).serve(0xc00a0986e0, {0x139f580, 0xc00673d2f0})
	/usr/local/go/src/net/http/server.go:1930 +0xb08 fp=0xc00ca51fb8 sp=0xc00ca51b80 pc=0x1225aa8
net/http.(*Server).Serve·dwrap·87()
	/usr/local/go/src/net/http/server.go:3034 +0x2e fp=0xc00ca51fe0 sp=0xc00ca51fb8 pc=0x122956e
runtime.goexit()
	/usr/local/go/src/runtime/asm_amd64.s:1581 +0x1 fp=0xc00ca51fe8 sp=0xc00ca51fe0 pc=0x1063c41
created by net/http.(*Server).Serve
	/usr/local/go/src/net/http/server.go:3034 +0x4e8

@yb01 yb01 changed the title from "service crash after flush() then access the record" to "service crash due to concurrent map operations" Jul 13, 2022
yb01 referenced this issue in yb01/global-resource-service-release0.1 Jul 13, 2022
@yb01 yb01 mentioned this issue Jul 13, 2022
yb01 added a commit that referenced this issue Jul 14, 2022
* periodically dump event metrics

* fix issue #79 and #80

* fix typo
@Sindica
Copy link
Collaborator

Sindica commented Jul 21, 2022

Resolving per fix in PR #93

@Sindica Sindica closed this as completed Jul 21, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants