From 302b35c2a0ac90208e523944e8cc4b44a793d8d5 Mon Sep 17 00:00:00 2001
From: Takeshi ISHII <2170248+mtei@users.noreply.github.com>
Date: Wed, 13 Jan 2021 10:46:22 +0900
Subject: [PATCH] fix matrix_io_delay() timing in quantum/matrix.c (#9603)

* fix matrix_io_delay() timing in quantum/matrix.c

* Updated comments explaining the need for matrix_io_delay() in quantum/matrix.c

* fix matrix_io_delay() timing in quantum/split_common/matrix.c

* Update quantum/matrix.c

Co-authored-by: Ryan <fauxpark@gmail.com>

* Update quantum/split_common/matrix.c

Co-authored-by: Ryan <fauxpark@gmail.com>

* Update quantum/matrix.c

Co-authored-by: Ryan <fauxpark@gmail.com>

* Update quantum/split_common/matrix.c

Co-authored-by: Ryan <fauxpark@gmail.com>

* add waitOutputPinValid() and wait_cpuclock() into quantum/quantum.h and tmk_core/common/wait.h

* add matrix_output_select_delay() and matrix_output_unselect_delay()

* fix quantum/matrix_common.c, tmk_core/common/matrix.h

* fix tmk_core/common/wait.h

* fix quantum/quantum.h, tmk_core/common/wait.h

* waitOutputPinValid() rename to waitInputPinDelay() in quantum/quantum.h.

* waitOutputPinValid() rename to waitInputPinDelay() in quantum/matrix_common.c

* update tmk_core/common/wait.h

* update comment in quantum/matrix.c, quantum/split_common/matrix.c

* update quantum/quantum.h: Make more margin in the GPIO_INPUT_PIN_DELAY default value.

Co-authored-by: Ryan <fauxpark@gmail.com>
---
 quantum/matrix.c              | 14 +++++++---
 quantum/matrix_common.c       |  5 ++++
 quantum/quantum.h             | 29 +++++++++++++++++++
 quantum/split_common/matrix.c | 14 +++++++---
 tmk_core/common/matrix.h      |  3 ++
 tmk_core/common/wait.h        | 52 +++++++++++++++++++++++++++++++++++
 6 files changed, 109 insertions(+), 8 deletions(-)

diff --git a/quantum/matrix.c b/quantum/matrix.c
index 9083ff3861..c027b7bf27 100644
--- a/quantum/matrix.c
+++ b/quantum/matrix.c
@@ -101,9 +101,9 @@ static bool read_cols_on_row(matrix_row_t current_matrix[], uint8_t current_row)
     // Start with a clear matrix row
     matrix_row_t current_row_value = 0;
 
-    // Select row and wait for row selecton to stabilize
+    // Select row
     select_row(current_row);
-    matrix_io_delay();
+    matrix_output_select_delay();
 
     // For each col...
     for (uint8_t col_index = 0; col_index < MATRIX_COLS; col_index++) {
@@ -116,6 +116,9 @@ static bool read_cols_on_row(matrix_row_t current_matrix[], uint8_t current_row)
 
     // Unselect row
     unselect_row(current_row);
+    if (current_row + 1 < MATRIX_ROWS) {
+        matrix_output_unselect_delay();  // wait for row signal to go HIGH
+    }
 
     // If the row has changed, store the row and return the changed flag.
     if (current_matrix[current_row] != current_row_value) {
@@ -147,9 +150,9 @@ static void init_pins(void) {
 static bool read_rows_on_col(matrix_row_t current_matrix[], uint8_t current_col) {
     bool matrix_changed = false;
 
-    // Select col and wait for col selecton to stabilize
+    // Select col
     select_col(current_col);
-    matrix_io_delay();
+    matrix_output_select_delay();
 
     // For each row...
     for (uint8_t row_index = 0; row_index < MATRIX_ROWS; row_index++) {
@@ -175,6 +178,9 @@ static bool read_rows_on_col(matrix_row_t current_matrix[], uint8_t current_col)
 
     // Unselect col
     unselect_col(current_col);
+    if (current_col + 1 < MATRIX_COLS) {
+        matrix_output_unselect_delay();  // wait for col signal to go HIGH
+    }
 
     return matrix_changed;
 }
diff --git a/quantum/matrix_common.c b/quantum/matrix_common.c
index 15f1e0e82e..01d2b38e5e 100644
--- a/quantum/matrix_common.c
+++ b/quantum/matrix_common.c
@@ -1,3 +1,4 @@
+#include "quantum.h"
 #include "matrix.h"
 #include "debounce.h"
 #include "wait.h"
@@ -83,8 +84,12 @@ uint8_t matrix_key_count(void) {
     return count;
 }
 
+/* `matrix_io_delay ()` exists for backwards compatibility. From now on, use matrix_output_unselect_delay(). */
 __attribute__((weak)) void matrix_io_delay(void) { wait_us(MATRIX_IO_DELAY); }
 
+__attribute__((weak)) void matrix_output_select_delay(void) { waitInputPinDelay(); }
+__attribute__((weak)) void matrix_output_unselect_delay(void) { matrix_io_delay(); }
+
 // CUSTOM MATRIX 'LITE'
 __attribute__((weak)) void matrix_init_custom(void) {}
 
diff --git a/quantum/quantum.h b/quantum/quantum.h
index 83694c8324..d234e6ea0d 100644
--- a/quantum/quantum.h
+++ b/quantum/quantum.h
@@ -210,6 +210,13 @@ typedef uint8_t pin_t;
 
 #    define togglePin(pin) (PORTx_ADDRESS(pin) ^= _BV((pin)&0xF))
 
+/*   The AVR series GPIOs have a one clock read delay for changes in the digital input signal.
+ *   But here's more margin to make it two clocks. */
+#    if !defined(GPIO_INPUT_PIN_DELAY)
+#        define GPIO_INPUT_PIN_DELAY 2
+#    endif
+#    define waitInputPinDelay() wait_cpuclock(GPIO_INPUT_PIN_DELAY)
+
 #elif defined(PROTOCOL_CHIBIOS)
 typedef ioline_t pin_t;
 
@@ -225,6 +232,28 @@ typedef ioline_t pin_t;
 #    define readPin(pin) palReadLine(pin)
 
 #    define togglePin(pin) palToggleLine(pin)
+
+#endif
+
+#if defined(__ARMEL__) || defined(__ARMEB__)
+/* For GPIOs on ARM-based MCUs, the input pins are sampled by the clock of the bus
+ * to which the GPIO is connected.
+ * The connected buses differ depending on the various series of MCUs.
+ * And since the instruction execution clock of the CPU and the bus clock of GPIO are different,
+ * there is a delay of several clocks to read the change of the input signal.
+ *
+ * Define this delay with the GPIO_INPUT_PIN_DELAY macro.
+ * If the GPIO_INPUT_PIN_DELAY macro is not defined, the following default values will be used.
+ * (A fairly large value of 0.25 microseconds is set.)
+ */
+#    if !defined(GPIO_INPUT_PIN_DELAY)
+#        if defined(STM32_SYSCLK)
+#            define GPIO_INPUT_PIN_DELAY (STM32_SYSCLK/1000000L / 4)
+#        elif defined(KINETIS_SYSCLK_FREQUENCY)
+#            define GPIO_INPUT_PIN_DELAY (KINETIS_SYSCLK_FREQUENCY/1000000L / 4)
+#        endif
+#    endif
+#    define waitInputPinDelay() wait_cpuclock(GPIO_INPUT_PIN_DELAY)
 #endif
 
 // Atomic macro to help make GPIO and other controls atomic.
diff --git a/quantum/split_common/matrix.c b/quantum/split_common/matrix.c
index 06bab734e7..067815c991 100644
--- a/quantum/split_common/matrix.c
+++ b/quantum/split_common/matrix.c
@@ -114,9 +114,9 @@ static bool read_cols_on_row(matrix_row_t current_matrix[], uint8_t current_row)
     // Start with a clear matrix row
     matrix_row_t current_row_value = 0;
 
-    // Select row and wait for row selecton to stabilize
+    // Select row
     select_row(current_row);
-    matrix_io_delay();
+    matrix_output_select_delay();
 
     // For each col...
     for (uint8_t col_index = 0; col_index < MATRIX_COLS; col_index++) {
@@ -129,6 +129,9 @@ static bool read_cols_on_row(matrix_row_t current_matrix[], uint8_t current_row)
 
     // Unselect row
     unselect_row(current_row);
+    if (current_row + 1 < MATRIX_ROWS) {
+        matrix_output_unselect_delay();  // wait for row signal to go HIGH
+    }
 
     // If the row has changed, store the row and return the changed flag.
     if (current_matrix[current_row] != current_row_value) {
@@ -160,9 +163,9 @@ static void init_pins(void) {
 static bool read_rows_on_col(matrix_row_t current_matrix[], uint8_t current_col) {
     bool matrix_changed = false;
 
-    // Select col and wait for col selecton to stabilize
+    // Select col
     select_col(current_col);
-    matrix_io_delay();
+    matrix_output_select_delay();
 
     // For each row...
     for (uint8_t row_index = 0; row_index < ROWS_PER_HAND; row_index++) {
@@ -188,6 +191,9 @@ static bool read_rows_on_col(matrix_row_t current_matrix[], uint8_t current_col)
 
     // Unselect col
     unselect_col(current_col);
+    if (current_col + 1 < MATRIX_COLS) {
+        matrix_output_unselect_delay();  // wait for col signal to go HIGH
+    }
 
     return matrix_changed;
 }
diff --git a/tmk_core/common/matrix.h b/tmk_core/common/matrix.h
index b570227a31..ce57010a47 100644
--- a/tmk_core/common/matrix.h
+++ b/tmk_core/common/matrix.h
@@ -55,6 +55,9 @@ matrix_row_t matrix_get_row(uint8_t row);
 /* print matrix for debug */
 void matrix_print(void);
 /* delay between changing matrix pin state and reading values */
+void matrix_output_select_delay(void);
+void matrix_output_unselect_delay(void);
+/* only for backwards compatibility. delay between changing matrix pin state and reading values */
 void matrix_io_delay(void);
 
 /* power control */
diff --git a/tmk_core/common/wait.h b/tmk_core/common/wait.h
index 89128e9daf..0b3fd755a9 100644
--- a/tmk_core/common/wait.h
+++ b/tmk_core/common/wait.h
@@ -6,10 +6,62 @@
 extern "C" {
 #endif
 
+#if defined(__ARMEL__) || defined(__ARMEB__)
+#    ifndef __OPTIMIZE__
+#        pragma message "Compiler optimizations disabled; wait_cpuclock() won't work as designed"
+#    endif
+
+#    define wait_cpuclock(x) wait_cpuclock_allnop(x)
+
+#    define CLOCK_DELAY_NOP8 "nop\n\t nop\n\t nop\n\t nop\n\t   nop\n\t nop\n\t nop\n\t nop\n\t"
+
+__attribute__((always_inline))
+static inline void wait_cpuclock_allnop(unsigned int n) { /* n: 1..135 */
+    /* The argument n must be a constant expression.
+     * That way, compiler optimization will remove unnecessary code. */
+    if (n < 1) { return; }
+    if (n > 8) {
+        unsigned int n8 = n/8;
+        n = n - n8*8;
+        switch (n8) {
+        case 16: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case 15: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case 14: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case 13: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case 12: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case 11: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case 10: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  9: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  8: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  7: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  6: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  5: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  4: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  3: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  2: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  1: asm volatile (CLOCK_DELAY_NOP8::: "memory");
+        case  0: break;
+        }
+    }
+    switch (n) {
+    case 8: asm volatile ("nop"::: "memory");
+    case 7: asm volatile ("nop"::: "memory");
+    case 6: asm volatile ("nop"::: "memory");
+    case 5: asm volatile ("nop"::: "memory");
+    case 4: asm volatile ("nop"::: "memory");
+    case 3: asm volatile ("nop"::: "memory");
+    case 2: asm volatile ("nop"::: "memory");
+    case 1: asm volatile ("nop"::: "memory");
+    case 0: break;
+    }
+}
+#endif
+
 #if defined(__AVR__)
 #    include <util/delay.h>
 #    define wait_ms(ms) _delay_ms(ms)
 #    define wait_us(us) _delay_us(us)
+#    define wait_cpuclock(x) __builtin_avr_delay_cycles(x)
 #elif defined PROTOCOL_CHIBIOS
 #    include <ch.h>
 #    define wait_ms(ms)                     \