-
Notifications
You must be signed in to change notification settings - Fork 28
[Feature] Implement DMA support #293
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 15 commits
308a213
90d9eef
5e615e1
30bdc36
e6c0b3b
c3f3dc4
046c860
4359f1f
aff3a8a
b2e41e8
5fc388c
70ae3da
fc589c5
17d282c
4b98641
e69c3de
82ac18d
1a1172b
94cc7d0
65fb185
d2d7e7d
28b263b
3af7d8b
326167d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,3 +2,4 @@ build | |
| __pycache__ | ||
| .hypothesis | ||
| .vscode | ||
| *.log | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,184 @@ | ||
| """ | ||
| ========================================================================= | ||
| CgraDmaRTL.py | ||
| ========================================================================= | ||
|
|
||
| Wrapper that composes a CGRA template with a DMA engine attached to the | ||
| CGRA data SPM. | ||
| """ | ||
|
|
||
| from pymtl3 import * | ||
|
|
||
| from .CgraTemplateRTL import CgraTemplateRTL | ||
| from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL | ||
| from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL | ||
| from ..lib.messages import * | ||
| from ..lib.util.data_struct_attr import * | ||
| from ..mem.dma.DmaEngineRTL import DmaEngineRTL | ||
|
|
||
|
|
||
| class CgraDmaRTL( Component ): | ||
| """ | ||
| CgraDmaRTL is a top-level wrapper that integrates a CGRA instance with a | ||
| DMA engine. | ||
|
|
||
| Architectural Design: | ||
| - It instantiates a standard CGRA template (`CgraTemplateRTL`) and a | ||
| DMA engine (`DmaEngineRTL`). | ||
| - CPU control packets are passed through to the CGRA's controller. | ||
| DMA commands are decoded there. | ||
| - The DMA engine accesses the CGRA's internal data SPM through controller- | ||
| forwarded ports; it is not connected directly to `DataMemControllerRTL`. | ||
| - External memory requests from the DMA engine are exposed at the top level | ||
| to be connected to a DRAM model or an AXI adapter. | ||
| - Boundary data ports for multi-CGRA configurations are also passed through | ||
| if enabled. | ||
| """ | ||
|
|
||
| def construct(s, CgraPayloadType, | ||
| multi_cgra_rows, | ||
| multi_cgra_columns, | ||
| per_cgra_rows, per_cgra_columns, | ||
| ctrl_mem_size, data_mem_size_global, | ||
| data_mem_size_per_bank, num_banks_per_cgra, | ||
| num_registers_per_reg_bank, num_ctrl, | ||
| total_steps, mem_access_is_combinational, | ||
| FunctionUnit, FuList, TileList, LinkList, | ||
| dataSPM, controller2addr_map, idTo2d_map, | ||
| is_multi_cgra = True, cgra_id = 0, | ||
| # For heterogeneous multi-cgra support.(maybe remove it in CgraDmaRTL for simplicity?) | ||
| provided_max_per_cgra_rows = None, | ||
| provided_max_per_cgra_cols = None, | ||
| provided_max_num_rd_tiles = None, | ||
| provided_max_num_wr_tiles = None): | ||
|
|
||
| DataType = CgraPayloadType.get_field_type(kAttrData) | ||
| data_bitwidth = DataType.get_field_type(kAttrPayload).nbits | ||
| assert data_bitwidth == 32 | ||
|
|
||
| max_per_cgra_rows = provided_max_per_cgra_rows if provided_max_per_cgra_rows is not None else per_cgra_rows | ||
| max_per_cgra_cols = provided_max_per_cgra_cols if provided_max_per_cgra_cols is not None else per_cgra_columns | ||
| max_num_tiles = max_per_cgra_rows * max_per_cgra_cols | ||
| max_num_rd_tiles = provided_max_num_rd_tiles if provided_max_num_rd_tiles is not None else dataSPM.getNumOfValidReadPorts() | ||
|
|
||
| CtrlPktType = mk_intra_cgra_pkt(multi_cgra_columns, multi_cgra_rows, | ||
| max_num_tiles, CgraPayloadType) | ||
| NocPktType = mk_inter_cgra_pkt(multi_cgra_columns, multi_cgra_rows, | ||
| max_num_tiles, max_num_rd_tiles, | ||
| CgraPayloadType) | ||
|
|
||
| CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows) | ||
| DataAddrType = mk_bits(clog2(data_mem_size_global)) | ||
| DmaDramAddrType = mk_bits(64) | ||
| DmaMemDataType = mk_bits(128) # Reads/writes 128 bits of data per beat from/to DRAM | ||
| DmaMemMaskType = mk_bits(16) | ||
|
|
||
| # Existing CGRA-facing interfaces. | ||
| # CGRA <-> CPU | ||
| s.recv_from_cpu_pkt = RecvIfcRTL(CtrlPktType) | ||
| s.send_to_cpu_pkt = SendIfcRTL(CtrlPktType) | ||
|
|
||
| if is_multi_cgra: | ||
| s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) | ||
| s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) | ||
|
|
||
| s.recv_data_on_boundary_north = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] | ||
| s.send_data_on_boundary_north = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] | ||
| s.recv_data_on_boundary_south = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] | ||
| s.send_data_on_boundary_south = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] | ||
| s.recv_data_on_boundary_west = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] | ||
| s.send_data_on_boundary_west = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] | ||
| s.recv_data_on_boundary_east = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] | ||
| s.send_data_on_boundary_east = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] | ||
|
|
||
| s.cgra_id = InPort(CgraIdType) | ||
| # The local address range of current CGRA. | ||
| # Any address out of this range will be assumed as remote address. | ||
| s.address_lower = InPort(DataAddrType) | ||
| s.address_upper = InPort(DataAddrType) | ||
|
|
||
| # Abstract external dram memory interfaces for the internal DMA engine. | ||
|
|
||
| s.dram_rd_req = SendIfcRTL(DmaDramAddrType) | ||
| s.dram_rd_resp = RecvIfcRTL(DmaMemDataType) | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Name this as `s.send_dram_rd_req`, to indicate its direction. |
||
|
|
||
| s.dram_wr_req_val = OutPort() | ||
| s.dram_wr_req_rdy = InPort() | ||
| s.dram_wr_req_addr = OutPort(DmaDramAddrType) | ||
| s.dram_wr_req_data = OutPort(DmaMemDataType) | ||
| s.dram_wr_req_mask = OutPort(DmaMemMaskType) # Masks for writing DRAM | ||
|
|
||
| s.dram_wr_resp_val = InPort() | ||
| s.dram_wr_resp_rdy = OutPort() | ||
|
|
||
| # Components. | ||
|
|
||
| s.cgra = CgraTemplateRTL(CgraPayloadType, | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was actually wondering why we need this
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I designed the DMA module first. To integrate the CGRA with the DMA, I wrote a wrapper module (CgraDmaRTL) that instantiates and connects both the CGRA and the DMA. I feel that directly instantiating and connecting a DMA inside CgraTemplateRTL would make the CGRA bloated. After all, many use cases don't require a DMA at all, and it doesn't align well with hierarchical modular design. I thought the CGRA and DMA should be separate modules, and a higher-level module should instantiate and connect them. WDYT?
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense. Then let's rename this file/module to And what about multi-cgra then? You will have another |
||
| multi_cgra_rows, | ||
| multi_cgra_columns, | ||
| per_cgra_rows, per_cgra_columns, | ||
| ctrl_mem_size, data_mem_size_global, | ||
| data_mem_size_per_bank, num_banks_per_cgra, | ||
| num_registers_per_reg_bank, num_ctrl, | ||
| total_steps, mem_access_is_combinational, | ||
| FunctionUnit, FuList, TileList, LinkList, | ||
| dataSPM, controller2addr_map, idTo2d_map, | ||
| is_multi_cgra, cgra_id, | ||
| provided_max_per_cgra_rows, | ||
| provided_max_per_cgra_cols, | ||
| provided_max_num_rd_tiles, | ||
| provided_max_num_wr_tiles, | ||
| has_dma_ports = True) | ||
|
|
||
| s.dma = DmaEngineRTL(spm_data_nbits = data_bitwidth, | ||
| spm_addr_nbits = clog2(data_mem_size_global)) | ||
|
|
||
| # CGRA passthrough connections. | ||
|
|
||
| s.recv_from_cpu_pkt //= s.cgra.recv_from_cpu_pkt | ||
| s.send_to_cpu_pkt //= s.cgra.send_to_cpu_pkt | ||
|
|
||
| if is_multi_cgra: | ||
| s.recv_from_inter_cgra_noc //= s.cgra.recv_from_inter_cgra_noc | ||
| s.send_to_inter_cgra_noc //= s.cgra.send_to_inter_cgra_noc | ||
|
|
||
| for i in range(max_per_cgra_cols): | ||
| s.recv_data_on_boundary_north[i] //= s.cgra.recv_data_on_boundary_north[i] | ||
| s.send_data_on_boundary_north[i] //= s.cgra.send_data_on_boundary_north[i] | ||
| s.recv_data_on_boundary_south[i] //= s.cgra.recv_data_on_boundary_south[i] | ||
| s.send_data_on_boundary_south[i] //= s.cgra.send_data_on_boundary_south[i] | ||
|
|
||
| for i in range(max_per_cgra_rows): | ||
| s.recv_data_on_boundary_west[i] //= s.cgra.recv_data_on_boundary_west[i] | ||
| s.send_data_on_boundary_west[i] //= s.cgra.send_data_on_boundary_west[i] | ||
| s.recv_data_on_boundary_east[i] //= s.cgra.recv_data_on_boundary_east[i] | ||
| s.send_data_on_boundary_east[i] //= s.cgra.send_data_on_boundary_east[i] | ||
|
|
||
| s.cgra_id //= s.cgra.cgra_id | ||
| s.address_lower //= s.cgra.address_lower | ||
| s.address_upper //= s.cgra.address_upper | ||
|
|
||
|
|
||
| # Connections between CGRA and DMA engine. | ||
| # CGRA communicates with DMA engine through the controller. | ||
| s.cgra.dma_cmd //= s.dma.dma_cmd | ||
| s.dma.dma_done //= s.cgra.dma_done | ||
|
|
||
| s.dram_rd_req //= s.dma.dram_rd_req | ||
| s.dram_rd_resp //= s.dma.dram_rd_resp | ||
|
|
||
| s.dram_wr_req_val //= s.dma.dram_wr_req_val | ||
| s.dram_wr_req_rdy //= s.dma.dram_wr_req_rdy | ||
| s.dram_wr_req_addr //= s.dma.dram_wr_req_addr | ||
| s.dram_wr_req_data //= s.dma.dram_wr_req_data | ||
| s.dram_wr_req_mask //= s.dma.dram_wr_req_mask | ||
|
|
||
| s.dram_wr_resp_val //= s.dma.dram_wr_resp_val | ||
| s.dram_wr_resp_rdy //= s.dma.dram_wr_resp_rdy | ||
|
|
||
| # DMA to controller-forwarded SPM connections. | ||
|
|
||
| s.dma.spm //= s.cgra.dma_spm | ||
|
|
||
| def line_trace(s): | ||
| return f"{s.dma.line_trace()} || {s.cgra.line_trace()}" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| from ..controller.ControllerRTL import ControllerRTL | ||
| from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL | ||
| from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL | ||
| from ..lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL | ||
| from ..lib.basic.val_rdy.queues import BypassQueueRTL | ||
| from ..lib.opt_type import * | ||
| from ..lib.util.common import * | ||
|
|
@@ -83,7 +84,8 @@ def construct(s, CgraPayloadType, | |
| provided_max_per_cgra_rows = None, | ||
| provided_max_per_cgra_cols = None, | ||
| provided_max_num_rd_tiles = None, | ||
| provided_max_num_wr_tiles = None): | ||
| provided_max_num_wr_tiles = None, | ||
| has_dma_ports = False): | ||
| """ | ||
| provided_max_per_cgra_rows: the row number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. | ||
| provided_max_per_cgra_cols: the column number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. | ||
|
|
@@ -126,6 +128,21 @@ def construct(s, CgraPayloadType, | |
| CtrlRingPos = mk_ring_pos(max_num_tiles + 1) | ||
| CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) | ||
| DataAddrType = mk_bits(clog2(data_mem_size_global)) | ||
| DmaDataType = DataType.get_field_type(kAttrPayload) | ||
| DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) | ||
| DmaOpcodeType = mk_bits(3) | ||
| DmaDramAddrType = mk_bits(64) | ||
| DmaBytesType = mk_bits(32) | ||
| DmaTagType = mk_bits(8) | ||
| DmaCmdType = mk_dma_cmd(DmaDramAddrType.nbits, | ||
| DataAddrType.nbits, | ||
| DmaBytesType.nbits, | ||
| DmaTagType.nbits) | ||
| DmaDoneType = mk_dma_done(DmaTagType.nbits) | ||
| DmaSpmWriteReqType = mk_dma_spm_write_req(DataAddrType.nbits, | ||
| DmaDataType.nbits) | ||
| DmaSpmReadReqType = mk_dma_spm_read_req(DataAddrType.nbits) | ||
| DmaSpmReadRespType = mk_dma_spm_read_resp(DmaDataType.nbits) | ||
| assert(data_mem_size_per_bank * num_banks_per_cgra <= \ | ||
| data_mem_size_global) | ||
|
|
||
|
|
@@ -135,6 +152,42 @@ def construct(s, CgraPayloadType, | |
| s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) | ||
| s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) | ||
|
|
||
| # Optional DMA engine-facing ports. The controller owns command decode and | ||
| # forwards DMA SPM access to the data memory. | ||
| if has_dma_ports: | ||
| s.dma_cmd = SendIfcRTL(DmaCmdType) | ||
| s.dma_cmd_val = s.dma_cmd.val | ||
| s.dma_cmd_rdy = s.dma_cmd.rdy | ||
| s.dma_cmd_opcode = s.dma_cmd.msg.opcode | ||
| s.dma_cmd_dram_addr = s.dma_cmd.msg.dram_addr | ||
| s.dma_cmd_spm_addr = s.dma_cmd.msg.spm_addr | ||
| s.dma_cmd_bytes = s.dma_cmd.msg.nbytes | ||
| s.dma_cmd_tag = s.dma_cmd.msg.tag | ||
|
tancheng marked this conversation as resolved.
Outdated
|
||
|
|
||
| s.dma_done = RecvIfcRTL(DmaDoneType) | ||
| s.dma_done_val = s.dma_done.val | ||
| s.dma_done_rdy = s.dma_done.rdy | ||
| s.dma_done_tag = s.dma_done.msg.tag | ||
|
tancheng marked this conversation as resolved.
Outdated
|
||
|
|
||
| s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, | ||
| DmaSpmReadReqType, | ||
| DmaSpmReadRespType) | ||
|
|
||
| # DMA write request interface. | ||
| s.spm_dma_wval = s.dma_spm.write.val # dma write request valid(write data into SPM) | ||
| s.spm_dma_wrdy = s.dma_spm.write.rdy | ||
| s.spm_dma_waddr = s.dma_spm.write.msg.addr | ||
| s.spm_dma_wdata = s.dma_spm.write.msg.data | ||
| s.spm_dma_wmask = s.dma_spm.write.msg.mask | ||
|
|
||
| # DMA read request and read response interfaces. | ||
| s.spm_dma_rval = s.dma_spm.read.val | ||
| s.spm_dma_rrdy = s.dma_spm.read.rdy | ||
| s.spm_dma_raddr = s.dma_spm.read.msg.addr | ||
| s.spm_dma_rresp_val = s.dma_spm.read_resp.val | ||
| s.spm_dma_rresp_rdy = s.dma_spm.read_resp.rdy | ||
| s.spm_dma_rresp_data = s.dma_spm.read_resp.msg.data | ||
|
|
||
| if is_multi_cgra: | ||
| # Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra. | ||
| # Remember to ground the remaining boundary ports of the current CGRA when the current CGRA has fewer rows or columns than the largest CGRA. | ||
|
|
@@ -168,11 +221,13 @@ def construct(s, CgraPayloadType, | |
| multi_cgra_columns, | ||
| max_num_tiles, | ||
| mem_access_is_combinational, | ||
| idTo2d_map) | ||
| idTo2d_map, | ||
| has_dma_ports) | ||
| s.cgra_id = InPort(CgraIdType) | ||
| s.controller = ControllerRTL(NocPktType, | ||
| multi_cgra_rows, multi_cgra_columns, | ||
| max_num_tiles, controller2addr_map, idTo2d_map) | ||
| max_num_tiles, controller2addr_map, idTo2d_map, | ||
| has_dma_ports) | ||
| # Connects controller id. | ||
| s.controller.cgra_id //= s.cgra_id | ||
| # An additional router for controller to receive CMD_COMPLETE signal from Ring to CPU. | ||
|
|
@@ -190,6 +245,17 @@ def construct(s, CgraPayloadType, | |
| s.data_mem.address_lower //= s.address_lower | ||
| s.data_mem.address_upper //= s.address_upper | ||
|
|
||
| if has_dma_ports: | ||
| # CPU packets are decoded by the controller before becoming DMA commands. | ||
| s.dma_cmd //= s.controller.dma_cmd | ||
| s.dma_done //= s.controller.dma_done | ||
|
|
||
| # DMA engine <-> controller side of the SPM path. | ||
| s.dma_spm //= s.controller.dma_spm_from_dma | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rename this?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should have two interfaces, one send and one recv, WDYT? The "send"/"recv" should also be in the name. |
||
|
|
||
| # Controller <-> data memory side of the SPM path. | ||
| s.controller.dma_spm_to_mem //= s.data_mem.dma_spm | ||
|
|
||
| # Connects data memory with controller. | ||
| s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request | ||
| s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Name this as
s.send_dram_rd_req, to indicate its direction.