diff --git a/b_asic/codegen/vhdl/__init__.py b/b_asic/codegen/vhdl/__init__.py
index 77bca662d997823234f06498ed98434d36baf872..e69765319fb6154b15b38024995ec6d86fa39979 100644
--- a/b_asic/codegen/vhdl/__init__.py
+++ b/b_asic/codegen/vhdl/__init__.py
@@ -2,7 +2,34 @@
 Module for basic VHDL code generation.
 """
 
+from io import TextIOWrapper
+
 # VHDL code generation tab length
 VHDL_TAB = r"    "
 
+
+def write(
+    f: TextIOWrapper,
+    indent_level: int,
+    text: str,
+    end: str = '\n',
+):
+    """
+    Base VHDL code generation utility. `f'{VHDL_TAB*indent_level}'` is first written to the :class:`io.TextIOWrapper`
+    object `f`. Immediately after the indentation, `text` is written to `f`. Finally, `end` is written to `f`.
+
+    Parameters
+    ----------
+    f : :class:`io.TextIOWrapper`
+        The file object to emit the VHDL code to.
+    indent_level : int
+        Indentation level to use. Exactly `f'{VHDL_TAB*indent_level}'` is written before the text is written.
+    text : str
+        The text to write.
+    end : str, default: '\\n'
+        Text to write exactly after `text` is written to `f`.
+    """
+    f.write(f'{VHDL_TAB*indent_level}{text}{end}')
+
+
 from b_asic.codegen.vhdl import architecture, common, entity
diff --git a/b_asic/codegen/vhdl/architecture.py b/b_asic/codegen/vhdl/architecture.py
index 80c933b9f57a6f1f24332342455869c7d30cfde6..0ab01d1c5f4d530491b9264ac8713cb60fe4412f 100644
--- a/b_asic/codegen/vhdl/architecture.py
+++ b/b_asic/codegen/vhdl/architecture.py
@@ -2,7 +2,7 @@
 Module for code generation of VHDL architectures.
 """
 from io import TextIOWrapper
-from typing import Set, cast
+from typing import Dict, List, Set, Tuple, cast
 
 from b_asic.codegen import vhdl
 from b_asic.codegen.vhdl import VHDL_TAB
@@ -263,8 +263,29 @@ def write_register_based_storage(
 ):
     architecture_name = "rtl"
     schedule_time = len(forward_backward_table)
+
+    # Number of registers in this design
     reg_cnt = len(forward_backward_table[0].regs)
 
+    # Set of the register indices to output from
+    output_regs = {entry.outputs_from for entry in forward_backward_table.table}
+    if None in output_regs:
+        output_regs.remove(None)
+    output_regs = cast(Set[int], output_regs)
+
+    # Table with mapping: register to output multiplexer index
+    output_mux_table = {reg: i for i, reg in enumerate(output_regs)}
+
+    # Back-edge register indices
+    back_edges: Set[Tuple[int, int]] = {
+        (frm, to)
+        for entry in forward_backward_table
+        for frm, to in entry.back_edge_to.items()
+    }
+    back_edge_table: Dict[Tuple[int, int], int] = {
+        edge: i + 1 for i, edge in enumerate(back_edges)
+    }
+
     #
     # Architecture declerative region begin
     #
@@ -277,7 +298,7 @@ def write_register_based_storage(
         f,
         name='schedule_cnt',
         type=f'integer range 0 to {schedule_time}-1',
-        name_pad=14,
+        name_pad=18,
         default_value='0',
     )
 
@@ -292,17 +313,25 @@ def write_register_based_storage(
         f,
         name='shift_reg',
         type='shift_reg_type',
-        name_pad=14,
+        name_pad=18,
+    )
+
+    # Back-edge mux select signal declaration
+    f.write(f'\n{VHDL_TAB}-- Back-edge mux select signal\n')
+    vhdl.common.write_signal_decl(
+        f,
+        name='back_edge_mux_sel',
+        type=f'integer range 0 to {len(back_edges)}',
+        name_pad=18,
     )
 
     # Output mux selector
     f.write(f'\n{VHDL_TAB}-- Output mux select signal\n')
-    output_regs = {entry.outputs_from for entry in forward_backward_table.table}
     vhdl.common.write_signal_decl(
         f,
         name='out_mux_sel',
         type=f'integer range 0 to {len(output_regs)-1}',
-        name_pad=14,
+        name_pad=18,
     )
 
     #
@@ -326,6 +355,31 @@ def write_register_based_storage(
         ),
     )
 
+    # Shift register back-edge decoding
+    f.write(f'\n{VHDL_TAB}-- Shift register back-edge decoding\n')
+    vhdl.common.write_synchronous_process_prologue(
+        f,
+        clk='clk',
+        name='shift_reg_back_edge_decode_proc',
+    )
+    vhdl.write(f, 3, f'case schedule_cnt is')
+    for time, entry in enumerate(forward_backward_table):
+        if entry.back_edge_to:
+            assert len(entry.back_edge_to) == 1
+            for src, dst in entry.back_edge_to.items():
+                mux_idx = back_edge_table[(src, dst)]
+                vhdl.write(f, 4, f'when {(time-1)%schedule_time} =>')
+                vhdl.write(f, 5, f'-- ({src} -> {dst})')
+                vhdl.write(f, 5, f'back_edge_mux_sel <= {mux_idx};')
+    vhdl.write(f, 4, f'when others =>')
+    vhdl.write(f, 5, f'back_edge_mux_sel <= 0;')
+    vhdl.write(f, 3, f'end case;')
+    vhdl.common.write_synchronous_process_epilogue(
+        f,
+        clk='clk',
+        name='shift_reg_back_edge_decode_proc',
+    )
+
     # Shift register multiplexer logic
     f.write(f'\n{VHDL_TAB}-- Multiplexers for shift register\n')
     vhdl.common.write_synchronous_process_prologue(
@@ -337,13 +391,16 @@ def write_register_based_storage(
     f.write(f'{3*VHDL_TAB}shift_reg(0) <= p_0_in;\n')
     for reg_idx in range(1, reg_cnt):
         f.write(f'{3*VHDL_TAB}shift_reg({reg_idx}) <= shift_reg({reg_idx-1});\n')
-
-    f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
-    for i, entry in enumerate(forward_backward_table):
-        if entry.back_edge_from:
-            f.write(f'{4*VHDL_TAB} when {schedule_time-1 if (i-1)<0 else (i-1)} =>\n')
-            for dst, src in entry.back_edge_from.items():
-                f.write(f'{5*VHDL_TAB} shift_reg({dst}) <= shift_reg({src});\n')
+    vhdl.write(f, 3, f'case back_edge_mux_sel is')
+    for edge, mux_sel in back_edge_table.items():
+        vhdl.write(f, 4, f'when {mux_sel} =>')
+        vhdl.write(f, 5, f'shift_reg({edge[1]}) <= shift_reg({edge[0]});')
+    # f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
+    # for i, entry in enumerate(forward_backward_table):
+    #    if entry.back_edge_from:
+    #        f.write(f'{4*VHDL_TAB} when {schedule_time-1 if (i-1)<0 else (i-1)} =>\n')
+    #        for dst, src in entry.back_edge_from.items():
+    #            f.write(f'{5*VHDL_TAB} shift_reg({dst}) <= shift_reg({src});\n')
     f.write(f'{4*VHDL_TAB}when others => null;\n')
     f.write(f'{3*VHDL_TAB}end case;\n')
 
@@ -353,29 +410,37 @@ def write_register_based_storage(
         name='shift_reg_proc',
     )
 
+    # Output multiplexer decoding logic
+    f.write(f'\n{VHDL_TAB}-- Output muliplexer decoding logic\n')
+    vhdl.common.write_synchronous_process_prologue(
+        f, clk='clk', name='out_mux_decode_proc'
+    )
+    f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
+    for i, entry in enumerate(forward_backward_table):
+        if entry.outputs_from is not None:
+            f.write(f'{4*VHDL_TAB}when {(i-1)%schedule_time} =>\n')
+            f.write(
+                f'{5*VHDL_TAB}out_mux_sel <= {output_mux_table[entry.outputs_from]};\n'
+            )
+    f.write(f'{3*VHDL_TAB}end case;\n')
+    vhdl.common.write_synchronous_process_epilogue(
+        f, clk='clk', name='out_mux_decode_proc'
+    )
+
     # Output multiplexer logic
     f.write(f'\n{VHDL_TAB}-- Output muliplexer\n')
-    f.write(f'\n{VHDL_TAB}-- {output_regs}\n')
-    f.write(f'\n{VHDL_TAB}-- { list(range(len(output_regs))) }\n')
     vhdl.common.write_synchronous_process_prologue(
         f,
         clk='clk',
         name='out_mux_proc',
     )
-    f.write(f'{3*VHDL_TAB}-- Default case\n')
-    f.write(f'{3*VHDL_TAB}p_0_out <= shift_reg({reg_cnt-1});\n')
-    f.write(f'{3*VHDL_TAB}case schedule_cnt is\n')
-    for i, entry in enumerate(forward_backward_table):
-        if entry.outputs_from is not None:
-            if entry.outputs_from != reg_cnt - 1:
-                f.write(f'{4*VHDL_TAB} when {i} =>\n')
-                if entry.outputs_from < 0:
-                    f.write(f'{5*VHDL_TAB} p_0_out <= p_{-1-entry.outputs_from}_in;\n')
-                else:
-                    f.write(
-                        f'{5*VHDL_TAB} p_0_out <= shift_reg({entry.outputs_from});\n'
-                    )
-    f.write(f'{4*VHDL_TAB}when others => null;\n')
+    f.write(f'{3*VHDL_TAB}case out_mux_sel is\n')
+    for reg_i, mux_i in output_mux_table.items():
+        f.write(f'{4*VHDL_TAB}when {mux_i} =>\n')
+        if reg_i < 0:
+            f.write(f'{5*VHDL_TAB}p_0_out <= p_{-1-reg_i}_in;\n')
+        else:
+            f.write(f'{5*VHDL_TAB}p_0_out <= shift_reg({reg_i});\n')
     f.write(f'{3*VHDL_TAB}end case;\n')
     vhdl.common.write_synchronous_process_epilogue(
         f,
diff --git a/b_asic/codegen/vhdl/common.py b/b_asic/codegen/vhdl/common.py
index 3248aaab0fbadc6014bab8e65f730b8bade2607a..6dbcb397825e682179db6a1a4f1aa5f6cf1879ea 100644
--- a/b_asic/codegen/vhdl/common.py
+++ b/b_asic/codegen/vhdl/common.py
@@ -7,6 +7,7 @@ from io import TextIOWrapper
 from subprocess import PIPE, Popen
 from typing import Any, Optional, Set, Tuple
 
+from b_asic.codegen import vhdl
 from b_asic.codegen.vhdl import VHDL_TAB
 
 
@@ -26,13 +27,13 @@ def write_b_asic_vhdl_preamble(f: TextIOWrapper):
         git_commit_id = process.communicate()[0].decode('utf-8').strip()
     except:
         pass
-    f.write(f'--\n')
-    f.write(f'-- This code was automatically generated by the B-ASIC toolbox.\n')
-    f.write(f'-- Code generation timestamp: ({datetime.now()})\n')
+    vhdl.write(f, 0, f'--')
+    vhdl.write(f, 0, f'-- This code was automatically generated by the B-ASIC toolbox.')
+    vhdl.write(f, 0, f'-- Code generation timestamp: ({datetime.now()})')
     if git_commit_id:
-        f.write(f'-- B-ASIC short commit hash: {git_commit_id}\n')
-    f.write(f'-- URL: https://gitlab.liu.se/da/B-ASIC\n')
-    f.write(f'--\n\n')
+        vhdl.write(f, 0, f'-- B-ASIC short commit hash: {git_commit_id}')
+    vhdl.write(f, 0, f'-- URL: https://gitlab.liu.se/da/B-ASIC')
+    vhdl.write(f, 0, f'--', end='\n\n')
 
 
 def write_ieee_header(
@@ -52,12 +53,12 @@ def write_ieee_header(
     numeric_std : bool, default: True
         Include the numeric_std header.
     """
-    f.write('library ieee;\n')
+    vhdl.write(f, 0, 'library ieee;')
     if std_logic_1164:
-        f.write('use ieee.std_logic_1164.all;\n')
+        vhdl.write(f, 0, 'use ieee.std_logic_1164.all;')
     if numeric_std:
-        f.write('use ieee.numeric_std.all;\n')
-    f.write('\n')
+        vhdl.write(f, 0, 'use ieee.numeric_std.all;')
+    vhdl.write(f, 0, '')
 
 
 def write_signal_decl(
@@ -95,21 +96,19 @@ def write_signal_decl(
     """
     # Spacing of VHDL signals declaration always with a single tab
     name_pad = 0 if name_pad is None else name_pad
-    f.write(f'{VHDL_TAB}signal {name:<{name_pad}} : {type}')
+    vhdl.write(f, 1, f'signal {name:<{name_pad}} : {type}', end='')
     if default_value is not None:
-        f.write(f' := {default_value}')
-    f.write(f';\n')
+        vhdl.write(f, 0, f' := {default_value}', end='')
+    vhdl.write(f, 0, ';')
     if vivado_ram_style:
-        f.write(f'{VHDL_TAB}attribute ram_style : string;\n')
-        f.write(
-            f'{VHDL_TAB}attribute ram_style of {name} : signal is'
-            f' "{vivado_ram_style}";\n'
+        vhdl.write(f, 1, f'attribute ram_style : string;')
+        vhdl.write(
+            f, 1, f'attribute ram_style of {name} : signal is "{vivado_ram_style}";'
         )
     if quartus_ram_style:
-        f.write(f'{VHDL_TAB}attribute ramstyle : string;\n')
-        f.write(
-            f'{VHDL_TAB}attribute ramstyle of {name} : signal is'
-            f' "{quartus_ram_style}";\n'
+        vhdl.write(f, 1, f'attribute ramstyle : string;')
+        vhdl.write(
+            f, 1, f'attribute ramstyle of {name} : signal is "{quartus_ram_style}";'
         )
 
 
@@ -138,7 +137,7 @@ def write_constant_decl(
         An optional left padding value applied to the name.
     """
     name_pad = 0 if name_pad is None else name_pad
-    f.write(f'{VHDL_TAB}constant {name:<{name_pad}} : {type} := {str(value)};\n')
+    vhdl.write(f, 1, f'constant {name:<{name_pad}} : {type} := {str(value)};')
 
 
 def write_type_decl(
@@ -158,7 +157,7 @@ def write_type_decl(
     alias : str
         The type to tie the new name to.
     """
-    f.write(f'{VHDL_TAB}type {name} is {alias};\n')
+    vhdl.write(f, 1, f'type {name} is {alias};')
 
 
 def write_process_prologue(
diff --git a/b_asic/resources.py b/b_asic/resources.py
index 69ff6ccfd111e4c386ee75d77f6c58b2e2ac73c5..2f582e3fe950ce147c0790fa22b4e345ea86c505 100644
--- a/b_asic/resources.py
+++ b/b_asic/resources.py
@@ -171,7 +171,9 @@ class _ForwardBackwardEntry:
         regs : List[Optional[Process]], optional
             regs
         back_edge_to : dict, optional
-        back_edge_from : List[Optional[Process]], optional
+            Dictionary containing back edges of this entry to registers in the next entry.
+        back_edge_from : dict, optional
+            Dictionary containing the back edge of the previous entry to registers in this entry.
         outputs_from : int, optional
         """
         self.inputs: List[Process] = [] if inputs is None else inputs
@@ -304,6 +306,7 @@ class _ForwardBackwardTable:
                     if nreg is None:
                         next_entry.regs[nreg_idx] = reg
                         entry.back_edge_to[cols - 1] = nreg_idx
+                        next_entry.back_edge_from[nreg_idx] = cols - 1
                         return
 
         # All passes failed, raise exception...