Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
308a213
Add the DmaEngine implementation and the test.
BenkangPeng Jun 1, 2026
90d9eef
[Test] Update the test of DmaEngine.
BenkangPeng Jun 1, 2026
5e615e1
Add DMA support to DataMemControllerRTL and implement corresponding t…
BenkangPeng Jun 2, 2026
30bdc36
Add the dma ports into CgraTemplateRTL
BenkangPeng Jun 2, 2026
e6c0b3b
Wrap the Cgra and Dma into one single module.
BenkangPeng Jun 2, 2026
c3f3dc4
[Script] Add the local_CI script file
BenkangPeng Jun 2, 2026
046c860
Update .gitignore to ignore the log file
BenkangPeng Jun 2, 2026
4359f1f
[Test] Add the test for CgraDmaRTL
BenkangPeng Jun 2, 2026
aff3a8a
[Fix] Fix the bit mismatch error between dma_idx and num_xbar_in_ports.
BenkangPeng Jun 2, 2026
b2e41e8
[Doc] Add some comments
BenkangPeng Jun 2, 2026
5fc388c
[Fix] Fix the bit mismatch by type convertion
BenkangPeng Jun 3, 2026
70ae3da
Move some constant into common header file
BenkangPeng Jun 3, 2026
fc589c5
[Refactor] Wrap the signals between dma and dram with SendIfcRTL and …
BenkangPeng Jun 3, 2026
17d282c
[Refactor] Update DMA command handling in CgraDmaRTL and CgraTemplate…
BenkangPeng Jun 13, 2026
4b98641
[Refactor] Simplify DMA interface connections in CgraDmaRTL, CgraTemp…
BenkangPeng Jun 13, 2026
e69c3de
[Fix] Use Outport instead of Wire in DmaWireIfcRTL to avoid the RTLIR…
BenkangPeng Jun 14, 2026
82ac18d
[CleanUp] Remove the unnecessary ports.
BenkangPeng Jun 14, 2026
1a1172b
[Feature] Introduce DMA data structure and DMA-to-DRAM write request …
BenkangPeng Jun 14, 2026
94cc7d0
[Refactor] Pass DmaCmdType and DmaDataType into DataMemController and…
BenkangPeng Jun 14, 2026
65fb185
[Refactor] Update DmaEngineRTL to use DmaDramWrReqIfcRTL for DRAM wri…
BenkangPeng Jun 14, 2026
d2d7e7d
[Refactor] Enhance DMA integration in CgraTemplateRTL and ControllerR…
BenkangPeng Jun 14, 2026
28b263b
[Refactor] Update CgraDmaRTL to utilize DmaDramWrReqIfcRTL for DRAM w…
BenkangPeng Jun 14, 2026
3af7d8b
[Fix] Fix the bitwidth mismatch error between DataType and DmaSpmData…
BenkangPeng Jun 15, 2026
326167d
[CleanUp] Update DMA attribute references to use new constants for im…
BenkangPeng Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ build
__pycache__
.hypothesis
.vscode
*.log
184 changes: 184 additions & 0 deletions cgra/CgraDmaRTL.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
=========================================================================
CgraDmaRTL.py
=========================================================================

Wrapper that composes a CGRA template with a DMA engine attached to the
CGRA data SPM.
"""

from pymtl3 import *

from .CgraTemplateRTL import CgraTemplateRTL
from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL
from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL
from ..lib.messages import *
from ..lib.util.data_struct_attr import *
from ..mem.dma.DmaEngineRTL import DmaEngineRTL


class CgraDmaRTL( Component ):
"""
CgraDmaRTL is a top-level wrapper that integrates a CGRA instance with a
DMA engine.

Architectural Design:
- It instantiates a standard CGRA template (`CgraTemplateRTL`) and a
DMA engine (`DmaEngineRTL`).
- CPU control packets are passed through to the CGRA's controller.
DMA commands are decoded there.
- The DMA engine accesses the CGRA's internal data SPM through controller-
forwarded ports; it is not connected directly to `DataMemControllerRTL`.
- External memory requests from the DMA engine are exposed at the top level
to be connected to a DRAM model or an AXI adapter.
- Boundary data ports for multi-CGRA configurations are also passed through
if enabled.
"""

def construct(s, CgraPayloadType,
multi_cgra_rows,
multi_cgra_columns,
per_cgra_rows, per_cgra_columns,
ctrl_mem_size, data_mem_size_global,
data_mem_size_per_bank, num_banks_per_cgra,
num_registers_per_reg_bank, num_ctrl,
total_steps, mem_access_is_combinational,
FunctionUnit, FuList, TileList, LinkList,
dataSPM, controller2addr_map, idTo2d_map,
is_multi_cgra = True, cgra_id = 0,
# For heterogeneous multi-cgra support. (Maybe remove it in CgraDmaRTL for simplicity?)
provided_max_per_cgra_rows = None,
provided_max_per_cgra_cols = None,
provided_max_num_rd_tiles = None,
provided_max_num_wr_tiles = None):

DataType = CgraPayloadType.get_field_type(kAttrData)
data_bitwidth = DataType.get_field_type(kAttrPayload).nbits
assert data_bitwidth == 32

max_per_cgra_rows = provided_max_per_cgra_rows if provided_max_per_cgra_rows is not None else per_cgra_rows
max_per_cgra_cols = provided_max_per_cgra_cols if provided_max_per_cgra_cols is not None else per_cgra_columns
max_num_tiles = max_per_cgra_rows * max_per_cgra_cols
max_num_rd_tiles = provided_max_num_rd_tiles if provided_max_num_rd_tiles is not None else dataSPM.getNumOfValidReadPorts()

CtrlPktType = mk_intra_cgra_pkt(multi_cgra_columns, multi_cgra_rows,
max_num_tiles, CgraPayloadType)
NocPktType = mk_inter_cgra_pkt(multi_cgra_columns, multi_cgra_rows,
max_num_tiles, max_num_rd_tiles,
CgraPayloadType)

CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows)
DataAddrType = mk_bits(clog2(data_mem_size_global))
DmaDramAddrType = mk_bits(64)
DmaMemDataType = mk_bits(128) # Write/Read 128 bits data per beat from/to DRAM
DmaMemMaskType = mk_bits(16)

# Existing CGRA-facing interfaces.
# CGRA <-> CPU
s.recv_from_cpu_pkt = RecvIfcRTL(CtrlPktType)
s.send_to_cpu_pkt = SendIfcRTL(CtrlPktType)

if is_multi_cgra:
s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType)
s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType)

s.recv_data_on_boundary_north = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
s.send_data_on_boundary_north = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
s.recv_data_on_boundary_south = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
s.send_data_on_boundary_south = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
s.recv_data_on_boundary_west = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)]
s.send_data_on_boundary_west = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)]
s.recv_data_on_boundary_east = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)]
s.send_data_on_boundary_east = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)]

s.cgra_id = InPort(CgraIdType)
# The local address range of current CGRA.
# Any address out of this range will be assumed as remote address.
s.address_lower = InPort(DataAddrType)
s.address_upper = InPort(DataAddrType)

# Abstract external dram memory interfaces for the internal DMA engine.

s.dram_rd_req = SendIfcRTL(DmaDramAddrType)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Name this as s.send_dram_rd_req, to indicate its direction.

s.dram_rd_resp = RecvIfcRTL(DmaMemDataType)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

name this as s.recv_dram_rd_resp.


s.dram_wr_req_val = OutPort()
s.dram_wr_req_rdy = InPort()
s.dram_wr_req_addr = OutPort(DmaDramAddrType)
s.dram_wr_req_data = OutPort(DmaMemDataType)
s.dram_wr_req_mask = OutPort(DmaMemMaskType) # Masks for writing DRAM

s.dram_wr_resp_val = InPort()
s.dram_wr_resp_rdy = OutPort()

# Components.

s.cgra = CgraTemplateRTL(CgraPayloadType,

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually wondering why we need this CgraDmaRTL.py. Can we just expand the CgraTemplateRTL to include the DMA-related ports/interfaces?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially designed the DMA module first. To integrate the CGRA with the DMA, I wrote a wrapper module(CgraDmaRTL) that instantiates and connects both the CGRA and the DMA.

I feel that directly instantiating and connecting a DMA inside CgraTemplateRTL would make the CGRA bloated. After all, many use cases don't require a DMA at all, and it doesn't align well with hierarchical modular design. I thought the CGRA and DMA should be separate modules, and a higher-level module should instantiate and connect them. WDYT?

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. Then let's rename this file/module to IntegratedDmaWithCgraRTL?

And what about multi-cgra then? You will have another IntegratedDmaWithMultiCgraRTL that contains one DMA and a multi-cgra, correct?

multi_cgra_rows,
multi_cgra_columns,
per_cgra_rows, per_cgra_columns,
ctrl_mem_size, data_mem_size_global,
data_mem_size_per_bank, num_banks_per_cgra,
num_registers_per_reg_bank, num_ctrl,
total_steps, mem_access_is_combinational,
FunctionUnit, FuList, TileList, LinkList,
dataSPM, controller2addr_map, idTo2d_map,
is_multi_cgra, cgra_id,
provided_max_per_cgra_rows,
provided_max_per_cgra_cols,
provided_max_num_rd_tiles,
provided_max_num_wr_tiles,
has_dma_ports = True)

s.dma = DmaEngineRTL(spm_data_nbits = data_bitwidth,
spm_addr_nbits = clog2(data_mem_size_global))

# CGRA passthrough connections.

s.recv_from_cpu_pkt //= s.cgra.recv_from_cpu_pkt
s.send_to_cpu_pkt //= s.cgra.send_to_cpu_pkt

if is_multi_cgra:
s.recv_from_inter_cgra_noc //= s.cgra.recv_from_inter_cgra_noc
s.send_to_inter_cgra_noc //= s.cgra.send_to_inter_cgra_noc

for i in range(max_per_cgra_cols):
s.recv_data_on_boundary_north[i] //= s.cgra.recv_data_on_boundary_north[i]
s.send_data_on_boundary_north[i] //= s.cgra.send_data_on_boundary_north[i]
s.recv_data_on_boundary_south[i] //= s.cgra.recv_data_on_boundary_south[i]
s.send_data_on_boundary_south[i] //= s.cgra.send_data_on_boundary_south[i]

for i in range(max_per_cgra_rows):
s.recv_data_on_boundary_west[i] //= s.cgra.recv_data_on_boundary_west[i]
s.send_data_on_boundary_west[i] //= s.cgra.send_data_on_boundary_west[i]
s.recv_data_on_boundary_east[i] //= s.cgra.recv_data_on_boundary_east[i]
s.send_data_on_boundary_east[i] //= s.cgra.send_data_on_boundary_east[i]

s.cgra_id //= s.cgra.cgra_id
s.address_lower //= s.cgra.address_lower
s.address_upper //= s.cgra.address_upper


# Connections between CGRA and DMA engine.
# CGRA communicates with DMA engine through the controller.
s.cgra.dma_cmd //= s.dma.dma_cmd
s.dma.dma_done //= s.cgra.dma_done

s.dram_rd_req //= s.dma.dram_rd_req
s.dram_rd_resp //= s.dma.dram_rd_resp

s.dram_wr_req_val //= s.dma.dram_wr_req_val
s.dram_wr_req_rdy //= s.dma.dram_wr_req_rdy
s.dram_wr_req_addr //= s.dma.dram_wr_req_addr
s.dram_wr_req_data //= s.dma.dram_wr_req_data
s.dram_wr_req_mask //= s.dma.dram_wr_req_mask

s.dram_wr_resp_val //= s.dma.dram_wr_resp_val
s.dram_wr_resp_rdy //= s.dma.dram_wr_resp_rdy

# DMA to controller-forwarded SPM connections.

s.dma.spm //= s.cgra.dma_spm

def line_trace(s):
  """Return the DMA engine trace and the CGRA trace joined by " || "."""
  # The DMA trace is collected first, matching the left-to-right order in
  # which the two sub-traces appear in the resulting string.
  dma_trace = s.dma.line_trace()
  cgra_trace = s.cgra.line_trace()
  return f"{dma_trace} || {cgra_trace}"
72 changes: 69 additions & 3 deletions cgra/CgraTemplateRTL.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ..controller.ControllerRTL import ControllerRTL
from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL
from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL
from ..lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL
from ..lib.basic.val_rdy.queues import BypassQueueRTL
from ..lib.opt_type import *
from ..lib.util.common import *
Expand Down Expand Up @@ -83,7 +84,8 @@ def construct(s, CgraPayloadType,
provided_max_per_cgra_rows = None,
provided_max_per_cgra_cols = None,
provided_max_num_rd_tiles = None,
provided_max_num_wr_tiles = None):
provided_max_num_wr_tiles = None,
has_dma_ports = False):
"""
provided_max_per_cgra_rows: the row number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch.
provided_max_per_cgra_cols: the column number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch.
Expand Down Expand Up @@ -126,6 +128,21 @@ def construct(s, CgraPayloadType,
CtrlRingPos = mk_ring_pos(max_num_tiles + 1)
CtrlAddrType = mk_bits(clog2(ctrl_mem_size))
DataAddrType = mk_bits(clog2(data_mem_size_global))
DmaDataType = DataType.get_field_type(kAttrPayload)
DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT))
DmaOpcodeType = mk_bits(3)
DmaDramAddrType = mk_bits(64)
DmaBytesType = mk_bits(32)
DmaTagType = mk_bits(8)
DmaCmdType = mk_dma_cmd(DmaDramAddrType.nbits,
DataAddrType.nbits,
DmaBytesType.nbits,
DmaTagType.nbits)
DmaDoneType = mk_dma_done(DmaTagType.nbits)
DmaSpmWriteReqType = mk_dma_spm_write_req(DataAddrType.nbits,
DmaDataType.nbits)
DmaSpmReadReqType = mk_dma_spm_read_req(DataAddrType.nbits)
DmaSpmReadRespType = mk_dma_spm_read_resp(DmaDataType.nbits)
assert(data_mem_size_per_bank * num_banks_per_cgra <= \
data_mem_size_global)

Expand All @@ -135,6 +152,42 @@ def construct(s, CgraPayloadType,
s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType)
s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType)

# Optional DMA engine-facing ports. The controller owns command decode and
# forwards DMA SPM access to the data memory.
if has_dma_ports:
s.dma_cmd = SendIfcRTL(DmaCmdType)
s.dma_cmd_val = s.dma_cmd.val
s.dma_cmd_rdy = s.dma_cmd.rdy
s.dma_cmd_opcode = s.dma_cmd.msg.opcode
s.dma_cmd_dram_addr = s.dma_cmd.msg.dram_addr
s.dma_cmd_spm_addr = s.dma_cmd.msg.spm_addr
s.dma_cmd_bytes = s.dma_cmd.msg.nbytes
s.dma_cmd_tag = s.dma_cmd.msg.tag
Comment thread
tancheng marked this conversation as resolved.
Outdated

s.dma_done = RecvIfcRTL(DmaDoneType)
s.dma_done_val = s.dma_done.val
s.dma_done_rdy = s.dma_done.rdy
s.dma_done_tag = s.dma_done.msg.tag
Comment thread
tancheng marked this conversation as resolved.
Outdated

s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType,
DmaSpmReadReqType,
DmaSpmReadRespType)

# DMA write request interface.
s.spm_dma_wval = s.dma_spm.write.val # DMA write request valid (writes data into the SPM)
s.spm_dma_wrdy = s.dma_spm.write.rdy
s.spm_dma_waddr = s.dma_spm.write.msg.addr
s.spm_dma_wdata = s.dma_spm.write.msg.data
s.spm_dma_wmask = s.dma_spm.write.msg.mask

# DMA read request and read response interfaces.
s.spm_dma_rval = s.dma_spm.read.val
s.spm_dma_rrdy = s.dma_spm.read.rdy
s.spm_dma_raddr = s.dma_spm.read.msg.addr
s.spm_dma_rresp_val = s.dma_spm.read_resp.val
s.spm_dma_rresp_rdy = s.dma_spm.read_resp.rdy
s.spm_dma_rresp_data = s.dma_spm.read_resp.msg.data

if is_multi_cgra:
# Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra.
# Remember to ground the remaining boundary ports of the current CGRA when the current CGRA has fewer rows or columns than the largest CGRA.
Expand Down Expand Up @@ -168,11 +221,13 @@ def construct(s, CgraPayloadType,
multi_cgra_columns,
max_num_tiles,
mem_access_is_combinational,
idTo2d_map)
idTo2d_map,
has_dma_ports)
s.cgra_id = InPort(CgraIdType)
s.controller = ControllerRTL(NocPktType,
multi_cgra_rows, multi_cgra_columns,
max_num_tiles, controller2addr_map, idTo2d_map)
max_num_tiles, controller2addr_map, idTo2d_map,
has_dma_ports)
# Connects controller id.
s.controller.cgra_id //= s.cgra_id
# An additional router for controller to receive CMD_COMPLETE signal from Ring to CPU.
Expand All @@ -190,6 +245,17 @@ def construct(s, CgraPayloadType,
s.data_mem.address_lower //= s.address_lower
s.data_mem.address_upper //= s.address_upper

if has_dma_ports:
# CPU packets are decoded by the controller before becoming DMA commands.
s.dma_cmd //= s.controller.dma_cmd
s.dma_done //= s.controller.dma_done

# DMA engine <-> controller side of the SPM path.
s.dma_spm //= s.controller.dma_spm_from_dma

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename this? dma_spm and dma_spm_from_dma sound confusing.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about s.dma_spm_from_dma //= s.controller.dma_spm_from_dma? My original idea was that dma_spm indicates the signal is used for communication between DMA and SPM, with suffixes like from_dma and to_mem indicating the direction of the signal.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should have two interfaces, one send and one recv, WDYT? and the "send"/"recv" should be in the name.


# Controller <-> data memory side of the SPM path.
s.controller.dma_spm_to_mem //= s.data_mem.dma_spm

# Connects data memory with controller.
s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request
s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request
Expand Down
Loading
Loading