diff --git a/bl31/bl31.ld.S b/bl31/bl31.ld.S
index 3327f31653ee55399df12caf89dcf78806d652bb..0639d8170afb13a4577f548b8fc185f31a8305fe 100644
--- a/bl31/bl31.ld.S
+++ b/bl31/bl31.ld.S
@@ -101,10 +101,31 @@ SECTIONS
      * The .bss section gets initialised to 0 at runtime.
      * Its base address must be 16-byte aligned.
      */
-    .bss : ALIGN(16) {
+    .bss (NOLOAD) : ALIGN(16) {
         __BSS_START__ = .;
         *(.bss*)
         *(COMMON)
+#if !USE_COHERENT_MEM
+        /*
+         * Bakery locks are stored in normal .bss memory
+         *
+         * Each lock's data is spread across multiple cache lines, one per CPU,
+         * but multiple locks can share the same cache line.
+         * The compiler will allocate enough memory for one CPU's bakery locks,
+         * the remaining cache lines are allocated by the linker script
+         */
+        . = ALIGN(CACHE_WRITEBACK_GRANULE);
+        __BAKERY_LOCK_START__ = .;
+        *(bakery_lock)
+        . = ALIGN(CACHE_WRITEBACK_GRANULE);
+        __PERCPU_BAKERY_LOCK_SIZE__ = . - __BAKERY_LOCK_START__;
+        . = . + (__PERCPU_BAKERY_LOCK_SIZE__ * (PLATFORM_CORE_COUNT - 1));
+        __BAKERY_LOCK_END__ = .;
+#ifdef PLAT_PERCPU_BAKERY_LOCK_SIZE
+    ASSERT(__PERCPU_BAKERY_LOCK_SIZE__ == PLAT_PERCPU_BAKERY_LOCK_SIZE,
+        "PLAT_PERCPU_BAKERY_LOCK_SIZE does not match bakery lock requirements");
+#endif
+#endif
         __BSS_END__ = .;
     } >RAM
 
@@ -126,6 +147,12 @@ SECTIONS
      */
     coherent_ram (NOLOAD) : ALIGN(4096) {
         __COHERENT_RAM_START__ = .;
+        /*
+         * Bakery locks are stored in coherent memory
+         *
+         * Each lock's data is contiguous and fully allocated by the compiler
+         */
+        *(bakery_lock)
         *(tzfw_coherent_mem)
         __COHERENT_RAM_END_UNALIGNED__ = .;
         /*
diff --git a/docs/firmware-design.md b/docs/firmware-design.md
index 18f634f4b3e47d9d8f72afe91893a7d297276441..41fb7c0d82c5c9ad59fb6139dda72aaa2da95e7d 100644
--- a/docs/firmware-design.md
+++ b/docs/firmware-design.md
@@ -1523,38 +1523,52 @@ approach described above.
 The below sections analyze the data structures allocated in the coherent memory
 region and the changes required to allocate them in normal memory.
 
-### PSCI Affinity map nodes
-
-The `psci_aff_map` data structure stores the hierarchial node information for
-each affinity level in the system including the PSCI states associated with them.
-By default, this data structure is allocated in the coherent memory region in
-the Trusted Firmware because it can be accessed by multiple CPUs, either with
-their caches enabled or disabled.
-
-	typedef struct aff_map_node {
-		unsigned long mpidr;
-		unsigned char ref_count;
-		unsigned char state;
-		unsigned char level;
-	#if USE_COHERENT_MEM
-		bakery_lock_t lock;
-	#else
-		unsigned char aff_map_index;
-	#endif
-	} aff_map_node_t;
+### Coherent memory usage in PSCI implementation
+
+The `psci_non_cpu_pd_nodes` data structure stores the platform's power domain
+tree information for state management of power domains. By default, this data
+structure is allocated in the coherent memory region in the Trusted Firmware
+because it can be accessed by multple CPUs, either with caches enabled or
+disabled.
+
+typedef struct non_cpu_pwr_domain_node {
+        /*
+         * Index of the first CPU power domain node level 0 which has this node
+         * as its parent.
+         */
+        unsigned int cpu_start_idx;
+
+        /*
+         * Number of CPU power domains which are siblings of the domain indexed
+         * by 'cpu_start_idx' i.e. all the domains in the range 'cpu_start_idx
+         * -> cpu_start_idx + ncpus' have this node as their parent.
+         */
+        unsigned int ncpus;
+
+        /*
+         * Index of the parent power domain node.
+         * TODO: Figure out whether to whether using pointer is more efficient.
+         */
+        unsigned int parent_node;
+
+        plat_local_state_t local_state;
+
+        unsigned char level;
+
+        /* For indexing the psci_lock array*/
+        unsigned char lock_index;
+} non_cpu_pd_node_t;
 
 In order to move this data structure to normal memory, the use of each of its
-fields must be analyzed. Fields like `mpidr` and `level` are only written once
-during cold boot. Hence removing them from coherent memory involves only doing
-a clean and invalidate of the cache lines after these fields are written.
-
-The fields `state` and `ref_count` can be concurrently accessed by multiple
-CPUs in different cache states. A Lamport's Bakery lock is used to ensure mutual
-exlusion to these fields. As a result, it is possible to move these fields out
-of coherent memory by performing software cache maintenance on them. The field
-`lock` is the bakery lock data structure when `USE_COHERENT_MEM` is enabled.
-The `aff_map_index` is used to identify the bakery lock when `USE_COHERENT_MEM`
-is disabled.
+fields must be analyzed. Fields like `cpu_start_idx`, `ncpus`, `parent_node`
+`level` and `lock_index` are only written once during cold boot. Hence removing
+them from coherent memory involves only doing a clean and invalidate of the
+cache lines after these fields are written.
+
+The field `local_state` can be concurrently accessed by multiple CPUs in
+different cache states. A Lamport's Bakery lock `psci_locks` is used to ensure
+mutual exlusion to this field and a clean and invalidate is needed after it
+is written.
 
 ### Bakery lock data
 
@@ -1563,9 +1577,13 @@ and is accessed by multiple CPUs with mismatched attributes. `bakery_lock_t` is
 defined as follows:
 
     typedef struct bakery_lock {
-        int owner;
-        volatile char entering[BAKERY_LOCK_MAX_CPUS];
-        volatile unsigned number[BAKERY_LOCK_MAX_CPUS];
+        /*
+         * The lock_data is a bit-field of 2 members:
+         * Bit[0]       : choosing. This field is set when the CPU is
+         *                choosing its bakery number.
+         * Bits[1 - 15] : number. This is the bakery number allocated.
+         */
+        volatile uint16_t lock_data[BAKERY_LOCK_MAX_CPUS];
     } bakery_lock_t;
 
 It is a characteristic of Lamport's Bakery algorithm that the volatile per-CPU
@@ -1589,17 +1607,14 @@ the update made by CPU0 as well.
 
 To use bakery locks when `USE_COHERENT_MEM` is disabled, the lock data structure
 has been redesigned. The changes utilise the characteristic of Lamport's Bakery
-algorithm mentioned earlier. The per-CPU fields of the new lock structure are
-aligned such that they are allocated on separate cache lines. The per-CPU data
-framework in Trusted Firmware is used to achieve this. This enables software to
+algorithm mentioned earlier. The bakery_lock structure only allocates the memory
+for a single CPU. The macro `DEFINE_BAKERY_LOCK` allocates all the bakery locks
+needed for a CPU into a section `bakery_lock`. The linker allocates the memory
+for other cores by using the total size allocated for the bakery_lock section
+and multiplying it with (PLATFORM_CORE_COUNT - 1). This enables software to
 perform software cache maintenance on the lock data structure without running
 into coherency issues associated with mismatched attributes.
 
-The per-CPU data framework enables consolidation of data structures on the
-fewest cache lines possible. This saves memory as compared to the scenario where
-each data structure is separately aligned to the cache line boundary to achieve
-the same effect.
-
 The bakery lock data structure `bakery_info_t` is defined for use when
 `USE_COHERENT_MEM` is disabled as follows:
 
@@ -1615,12 +1630,10 @@ The bakery lock data structure `bakery_info_t` is defined for use when
 
 The `bakery_info_t` represents a single per-CPU field of one lock and
 the combination of corresponding `bakery_info_t` structures for all CPUs in the
-system represents the complete bakery lock. It is embedded in the per-CPU
-data framework `cpu_data` as shown below:
+system represents the complete bakery lock. The view in memory for a system
+with n bakery locks are:
 
-      CPU0 cpu_data
-    ------------------
-    | ....           |
+    bakery_lock section start
     |----------------|
     | `bakery_info_t`| <-- Lock_0 per-CPU field
     |    Lock_0      |     for CPU0
@@ -1633,12 +1646,11 @@ data framework `cpu_data` as shown below:
     | `bakery_info_t`| <-- Lock_N per-CPU field
     |    Lock_N      |     for CPU0
     ------------------
-
-
-      CPU1 cpu_data
+    |    XXXXX       |
+    | Padding to     |
+    | next Cache WB  | <--- Calculate PERCPU_BAKERY_LOCK_SIZE, allocate
+    |  Granule       |       continuous memory for remaining CPUs.
     ------------------
-    | ....           |
-    |----------------|
     | `bakery_info_t`| <-- Lock_0 per-CPU field
     |    Lock_0      |     for CPU1
     |----------------|
@@ -1650,14 +1662,20 @@ data framework `cpu_data` as shown below:
     | `bakery_info_t`| <-- Lock_N per-CPU field
     |    Lock_N      |     for CPU1
     ------------------
+    |    XXXXX       |
+    | Padding to     |
+    | next Cache WB  |
+    |  Granule       |
+    ------------------
 
-Consider a system of 2 CPUs with 'N' bakery locks as shown above.  For an
+Consider a system of 2 CPUs with 'N' bakery locks as shown above. For an
 operation on Lock_N, the corresponding `bakery_info_t` in both CPU0 and CPU1
-`cpu_data` need to be fetched and appropriate cache operations need to be
-performed for each access.
+`bakery_lock` section need to be fetched and appropriate cache operations need
+to be performed for each access.
+
+On ARM Platforms, bakery locks are used in psci (`psci_locks`) and power controller
+driver (`arm_lock`).
 
-For multiple bakery locks, an array of `bakery_info_t` is declared in `cpu_data`
-and each lock is given an `id` to identify it in the array.
 
 ### Non Functional Impact of removing coherent memory
 
@@ -1680,10 +1698,9 @@ Juno ARM development platform.
 As mentioned earlier, almost a page of memory can be saved by disabling
 `USE_COHERENT_MEM`. Each platform needs to consider these trade-offs to decide
 whether coherent memory should be used. If a platform disables
-`USE_COHERENT_MEM` and needs to use bakery locks in the porting layer, it should
-reserve memory in `cpu_data` by defining the macro `PLAT_PCPU_DATA_SIZE` (see
-the [Porting Guide]). Refer to the reference platform code for examples.
-
+`USE_COHERENT_MEM` and needs to use bakery locks in the porting layer, it can
+optionally define macro `PLAT_PERCPU_BAKERY_LOCK_SIZE`  (see the [Porting
+Guide]). Refer to the reference platform code for examples.
 
 12.  Code Structure
 -------------------
diff --git a/docs/porting-guide.md b/docs/porting-guide.md
index 6846ddfe2c52738920097a82f498c9e6f718980a..04793a2df3cddce477c8779748b96d2222759a4d 100644
--- a/docs/porting-guide.md
+++ b/docs/porting-guide.md
@@ -76,21 +76,24 @@ mapped page tables, and enable both the instruction and data caches for each BL
 stage. In ARM standard platforms, each BL stage configures the MMU in
 the platform-specific architecture setup function, `blX_plat_arch_setup()`.
 
-If the build option `USE_COHERENT_MEM` is enabled, each platform must allocate a
+If the build option `USE_COHERENT_MEM` is enabled, each platform can allocate a
 block of identity mapped secure memory with Device-nGnRE attributes aligned to
-page boundary (4K) for each BL stage. This memory is identified by the section
-name `tzfw_coherent_mem` so that its possible for the firmware to place
-variables in it using the following C code directive:
+page boundary (4K) for each BL stage. All sections which allocate coherent
+memory are grouped under `coherent_ram`. For ex: Bakery locks are placed in a
+section identified by name `bakery_lock` inside `coherent_ram` so that its
+possible for the firmware to place variables in it using the following C code
+directive:
 
-    __attribute__ ((section("tzfw_coherent_mem")))
+    __attribute__ ((section("bakery_lock")))
 
 Or alternatively the following assembler code directive:
 
-    .section tzfw_coherent_mem
+    .section bakery_lock
 
-The `tzfw_coherent_mem` section is used to allocate any data structures that are
-accessed both when a CPU is executing with its MMU and caches enabled, and when
-it's running with its MMU and caches disabled. Examples are given below.
+The `coherent_ram` section is a sum of all sections like `bakery_lock` which are
+used to allocate any data structures that are accessed both when a CPU is
+executing with its MMU and caches enabled, and when it's running with its MMU
+and caches disabled. Examples are given below.
 
 The following variables, functions and constants must be defined by the platform
 for the firmware to work correctly.
@@ -1150,6 +1153,12 @@ of the system counter, which is retrieved from the first entry in the frequency
 modes table.
 
 
+*   **#define : PLAT_PERCPU_BAKERY_LOCK_SIZE** [optional]
+
+    It is used if the bakery locks are using normal memory. It defines the memory
+   (in bytes) to be allocated for the bakery locks and needs to be a multiple of
+   cache line size.
+
 3.3 Power State Coordination Interface (in BL3-1)
 ------------------------------------------------
 
diff --git a/include/bl31/services/psci.h b/include/bl31/services/psci.h
index 30a53ca27bf7b55ae1151847cf5eab84dbf5cacf..6298a4046a9c8ec241bf95018eff01f99039bcd0 100644
--- a/include/bl31/services/psci.h
+++ b/include/bl31/services/psci.h
@@ -251,9 +251,6 @@ typedef struct psci_cpu_data {
 
 	/* The local power state of this CPU */
 	plat_local_state_t local_state;
-#if !USE_COHERENT_MEM
-	bakery_info_t pcpu_bakery_info[PSCI_NUM_NON_CPU_PWR_DOMAINS];
-#endif
 } psci_cpu_data_t;
 
 /*******************************************************************************
diff --git a/include/lib/bakery_lock.h b/include/lib/bakery_lock.h
index 2e1afa21f3b694c1b217ecd9697e6aa7eddb540f..86adb9cb145b4d0fe0f4dd676a666e709dab6e9f 100644
--- a/include/lib/bakery_lock.h
+++ b/include/lib/bakery_lock.h
@@ -56,6 +56,11 @@
  * External bakery lock interface.
  ****************************************************************************/
 #if USE_COHERENT_MEM
+/*
+ * Bakery locks are stored in coherent memory
+ *
+ * Each lock's data is contiguous and fully allocated by the compiler
+ */
 
 typedef struct bakery_lock {
 	/*
@@ -67,12 +72,15 @@ typedef struct bakery_lock {
 	volatile uint16_t lock_data[BAKERY_LOCK_MAX_CPUS];
 } bakery_lock_t;
 
-void bakery_lock_init(bakery_lock_t *bakery);
-void bakery_lock_get(bakery_lock_t *bakery);
-void bakery_lock_release(bakery_lock_t *bakery);
-int bakery_lock_try(bakery_lock_t *bakery);
-
 #else
+/*
+ * Bakery locks are stored in normal .bss memory
+ *
+ * Each lock's data is spread across multiple cache lines, one per CPU,
+ * but multiple locks can share the same cache line.
+ * The compiler will allocate enough memory for one CPU's bakery locks,
+ * the remaining cache lines are allocated by the linker script
+ */
 
 typedef struct bakery_info {
 	/*
@@ -84,9 +92,19 @@ typedef struct bakery_info {
 	volatile uint16_t lock_data;
 } bakery_info_t;
 
-void bakery_lock_get(unsigned int id, unsigned int offset);
-void bakery_lock_release(unsigned int id, unsigned int offset);
+typedef bakery_info_t bakery_lock_t;
 
 #endif /* __USE_COHERENT_MEM__ */
+
+inline void bakery_lock_init(bakery_lock_t *bakery) {}
+void bakery_lock_get(bakery_lock_t *bakery);
+void bakery_lock_release(bakery_lock_t *bakery);
+
+#define DEFINE_BAKERY_LOCK(_name) bakery_lock_t _name \
+			__attribute__ ((section("bakery_lock")))
+
+#define DECLARE_BAKERY_LOCK(_name) extern bakery_lock_t _name
+
+
 #endif /* __ASSEMBLY__ */
 #endif /* __BAKERY_LOCK_H__ */
diff --git a/include/plat/arm/common/arm_def.h b/include/plat/arm/common/arm_def.h
index a22e64ab344f589a92e685d98edd8384f5c89c17..c236970ab3980300627ead0cca92de2c0ac79010 100644
--- a/include/plat/arm/common/arm_def.h
+++ b/include/plat/arm/common/arm_def.h
@@ -206,14 +206,6 @@
  */
 #define CACHE_WRITEBACK_GRANULE		(1 << ARM_CACHE_WRITEBACK_SHIFT)
 
-#if !USE_COHERENT_MEM
-/*
- * Size of the per-cpu data in bytes that should be reserved in the generic
- * per-cpu data structure for the ARM platform port.
- */
-#define PLAT_PCPU_DATA_SIZE		2
-#endif
-
 
 /*******************************************************************************
  * BL1 specific defines.
@@ -301,4 +293,10 @@
 #define TSP_IRQ_SEC_PHY_TIMER		ARM_IRQ_SEC_PHY_TIMER
 
 
+/*
+ * One cache line needed for bakery locks on ARM platforms
+ */
+#define PLAT_PERCPU_BAKERY_LOCK_SIZE		(1 * CACHE_WRITEBACK_GRANULE)
+
+
 #endif /* __ARM_DEF_H__ */
diff --git a/include/plat/arm/common/plat_arm.h b/include/plat/arm/common/plat_arm.h
index 823212cb3a356f0420c85ef8b3fd9664961a6a23..ad41f4f0a1829dc479f61fa4e7734029ebedeeef 100644
--- a/include/plat/arm/common/plat_arm.h
+++ b/include/plat/arm/common/plat_arm.h
@@ -71,14 +71,11 @@ void arm_configure_mmu_el3(unsigned long total_base,
 );
 
 #if IMAGE_BL31
-#if USE_COHERENT_MEM
-
 /*
  * Use this macro to instantiate lock before it is used in below
  * arm_lock_xxx() macros
  */
-#define ARM_INSTANTIATE_LOCK	bakery_lock_t arm_lock	\
-	__attribute__ ((section("tzfw_coherent_mem")));
+#define ARM_INSTANTIATE_LOCK	DEFINE_BAKERY_LOCK(arm_lock);
 
 /*
  * These are wrapper macros to the Coherent Memory Bakery Lock API.
@@ -89,58 +86,9 @@ void arm_configure_mmu_el3(unsigned long total_base,
 
 #else
 
-/*******************************************************************************
- * Constants to specify how many bakery locks this platform implements. These
- * are used if the platform chooses not to use coherent memory for bakery lock
- * data structures.
- ******************************************************************************/
-#define ARM_MAX_BAKERIES	1
-#define ARM_PWRC_BAKERY_ID	0
-
-/* Empty definition */
-#define ARM_INSTANTIATE_LOCK
-
-/*******************************************************************************
- * Definition of structure which holds platform specific per-cpu data. Currently
- * it holds only the bakery lock information for each cpu.
- ******************************************************************************/
-typedef struct arm_cpu_data {
-	bakery_info_t pcpu_bakery_info[ARM_MAX_BAKERIES];
-} arm_cpu_data_t;
-
-/* Macro to define the offset of bakery_info_t in arm_cpu_data_t */
-#define ARM_CPU_DATA_LOCK_OFFSET	__builtin_offsetof\
-					    (arm_cpu_data_t, pcpu_bakery_info)
-
-
-/*******************************************************************************
- * Helper macros for bakery lock api when using the above arm_cpu_data_t for
- * bakery lock data structures. It assumes that the bakery_info is at the
- * beginning of the platform specific per-cpu data.
- ******************************************************************************/
-#define arm_lock_init()		/* No init required */
-#define arm_lock_get()		bakery_lock_get(ARM_PWRC_BAKERY_ID,	\
-					CPU_DATA_PLAT_PCPU_OFFSET +	\
-					ARM_CPU_DATA_LOCK_OFFSET)
-#define arm_lock_release()	bakery_lock_release(ARM_PWRC_BAKERY_ID,	\
-					CPU_DATA_PLAT_PCPU_OFFSET +	\
-					ARM_CPU_DATA_LOCK_OFFSET)
-
 /*
- * Ensure that the size of the platform specific per-cpu data structure and
- * the size of the memory allocated in generic per-cpu data for the platform
- * are the same.
+ * Empty macros for all other BL stages other than BL3-1
  */
-CASSERT(PLAT_PCPU_DATA_SIZE == sizeof(arm_cpu_data_t),
-	arm_pcpu_data_size_mismatch);
-
-#endif /* USE_COHERENT_MEM */
-
-#else
-
-/*
-* Dummy macros for all other BL stages other than BL3-1
-*/
 #define ARM_INSTANTIATE_LOCK
 #define arm_lock_init()
 #define arm_lock_get()
diff --git a/lib/locks/bakery/bakery_lock_coherent.c b/lib/locks/bakery/bakery_lock_coherent.c
index 1c60dba78f01f6f62269b3ebc54d9fe47a0e87f5..f2212228b4a86d538ebe142f8ba2c9cd28e6a054 100644
--- a/lib/locks/bakery/bakery_lock_coherent.c
+++ b/lib/locks/bakery/bakery_lock_coherent.c
@@ -63,16 +63,6 @@
 	assert(entry < BAKERY_LOCK_MAX_CPUS);		\
 } while (0)
 
-/* Initialize Bakery Lock to reset all ticket values */
-void bakery_lock_init(bakery_lock_t *bakery)
-{
-	assert(bakery);
-
-	/* All ticket values need to be 0 */
-	memset(bakery, 0, sizeof(*bakery));
-}
-
-
 /* Obtain a ticket for a given CPU */
 static unsigned int bakery_get_ticket(bakery_lock_t *bakery, unsigned int me)
 {
diff --git a/lib/locks/bakery/bakery_lock_normal.c b/lib/locks/bakery/bakery_lock_normal.c
index 3ca76e0d9be88542d28b17c04ec8c9a843d49e68..45b870b9de9ecc10afd8666b128b93ded3a2e8fe 100644
--- a/lib/locks/bakery/bakery_lock_normal.c
+++ b/lib/locks/bakery/bakery_lock_normal.c
@@ -56,12 +56,29 @@
  * accesses regardless of status of address translation.
  */
 
-/* This macro assumes that the bakery_info array is located at the offset specified */
-#define get_my_bakery_info(offset, id)		\
-	(((bakery_info_t *) (((uint8_t *)_cpu_data()) + offset)) + id)
+#ifdef PLAT_PERCPU_BAKERY_LOCK_SIZE
+/*
+ * Verify that the platform defined value for the per-cpu space for bakery locks is
+ * a multiple of the cache line size, to prevent multiple CPUs writing to the same
+ * bakery lock cache line
+ *
+ * Using this value, if provided, rather than the linker generated value results in
+ * more efficient code
+ */
+CASSERT((PLAT_PERCPU_BAKERY_LOCK_SIZE & (CACHE_WRITEBACK_GRANULE - 1)) == 0, \
+	PLAT_PERCPU_BAKERY_LOCK_SIZE_not_cacheline_multiple);
+#define PERCPU_BAKERY_LOCK_SIZE (PLAT_PERCPU_BAKERY_LOCK_SIZE)
+#else
+/*
+ * Use the linker defined symbol which has evaluated the size reqiurement.
+ * This is not as efficient as using a platform defined constant
+ */
+extern void *__PERCPU_BAKERY_LOCK_SIZE__;
+#define PERCPU_BAKERY_LOCK_SIZE ((uintptr_t)&__PERCPU_BAKERY_LOCK_SIZE__)
+#endif
 
-#define get_bakery_info_by_index(offset, id, ix)	\
-	(((bakery_info_t *) (((uint8_t *)_cpu_data_by_index(ix)) + offset)) + id)
+#define get_bakery_info(cpu_ix, lock)	\
+	(bakery_info_t *)((uintptr_t)lock + cpu_ix * PERCPU_BAKERY_LOCK_SIZE)
 
 #define write_cache_op(addr, cached)	\
 				do {	\
@@ -73,7 +90,7 @@
 #define read_cache_op(addr, cached)	if (cached) \
 					    dccivac((uint64_t)addr)
 
-static unsigned int bakery_get_ticket(int id, unsigned int offset,
+static unsigned int bakery_get_ticket(bakery_lock_t *lock,
 						unsigned int me, int is_cached)
 {
 	unsigned int my_ticket, their_ticket;
@@ -84,7 +101,7 @@ static unsigned int bakery_get_ticket(int id, unsigned int offset,
 	 * Obtain a reference to the bakery information for this cpu and ensure
 	 * it is not NULL.
 	 */
-	my_bakery_info = get_my_bakery_info(offset, id);
+	my_bakery_info = get_bakery_info(me, lock);
 	assert(my_bakery_info);
 
 	/*
@@ -115,7 +132,7 @@ static unsigned int bakery_get_ticket(int id, unsigned int offset,
 		 * Get a reference to the other contender's bakery info and
 		 * ensure that a stale copy is not read.
 		 */
-		their_bakery_info = get_bakery_info_by_index(offset, id, they);
+		their_bakery_info = get_bakery_info(they, lock);
 		assert(their_bakery_info);
 
 		read_cache_op(their_bakery_info, is_cached);
@@ -141,7 +158,7 @@ static unsigned int bakery_get_ticket(int id, unsigned int offset,
 	return my_ticket;
 }
 
-void bakery_lock_get(unsigned int id, unsigned int offset)
+void bakery_lock_get(bakery_lock_t *lock)
 {
 	unsigned int they, me, is_cached;
 	unsigned int my_ticket, my_prio, their_ticket;
@@ -153,7 +170,7 @@ void bakery_lock_get(unsigned int id, unsigned int offset)
 	is_cached = read_sctlr_el3() & SCTLR_C_BIT;
 
 	/* Get a ticket */
-	my_ticket = bakery_get_ticket(id, offset, me, is_cached);
+	my_ticket = bakery_get_ticket(lock, me, is_cached);
 
 	/*
 	 * Now that we got our ticket, compute our priority value, then compare
@@ -168,7 +185,7 @@ void bakery_lock_get(unsigned int id, unsigned int offset)
 		 * Get a reference to the other contender's bakery info and
 		 * ensure that a stale copy is not read.
 		 */
-		their_bakery_info = get_bakery_info_by_index(offset, id, they);
+		their_bakery_info = get_bakery_info(they, lock);
 		assert(their_bakery_info);
 
 		/* Wait for the contender to get their ticket */
@@ -199,12 +216,12 @@ void bakery_lock_get(unsigned int id, unsigned int offset)
 	/* Lock acquired */
 }
 
-void bakery_lock_release(unsigned int id, unsigned int offset)
+void bakery_lock_release(bakery_lock_t *lock)
 {
 	bakery_info_t *my_bakery_info;
 	unsigned int is_cached = read_sctlr_el3() & SCTLR_C_BIT;
 
-	my_bakery_info = get_my_bakery_info(offset, id);
+	my_bakery_info = get_bakery_info(plat_my_core_pos(), lock);
 	assert(bakery_ticket_number(my_bakery_info->lock_data));
 
 	my_bakery_info->lock_data = 0;
diff --git a/plat/mediatek/mt8173/drivers/spm/spm.c b/plat/mediatek/mt8173/drivers/spm/spm.c
index f67daea034231538837c0b56e1e571e68a665f72..7c6d72bec23e2beee78abfd0f3ebfa60ca0cfcbd 100644
--- a/plat/mediatek/mt8173/drivers/spm/spm.c
+++ b/plat/mediatek/mt8173/drivers/spm/spm.c
@@ -53,7 +53,8 @@
 static int spm_dormant_sta = CPU_DORMANT_RESET;
 #endif
 
-static bakery_lock_t spm_lock __attribute__ ((section("tzfw_coherent_mem")));
+DEFINE_BAKERY_LOCK(spm_lock);
+
 static int spm_hotplug_ready __attribute__ ((section("tzfw_coherent_mem")));
 static int spm_mcdi_ready __attribute__ ((section("tzfw_coherent_mem")));
 static int spm_suspend_ready __attribute__ ((section("tzfw_coherent_mem")));
diff --git a/services/std_svc/psci/psci_common.c b/services/std_svc/psci/psci_common.c
index e12df04b4b466d71a2cfcc4f47302e0f1f23cfff..733269527f13c597cfc81fd28e255de92f70dce3 100644
--- a/services/std_svc/psci/psci_common.c
+++ b/services/std_svc/psci/psci_common.c
@@ -78,6 +78,8 @@ __attribute__ ((section("tzfw_coherent_mem")))
 #endif
 ;
 
+DEFINE_BAKERY_LOCK(psci_locks[PSCI_NUM_NON_CPU_PWR_DOMAINS]);
+
 cpu_pd_node_t psci_cpu_pd_nodes[PLATFORM_CORE_COUNT];
 
 /*******************************************************************************
diff --git a/services/std_svc/psci/psci_private.h b/services/std_svc/psci/psci_private.h
index 9b55d9f395949c2099135fb48699e18849540c61..8c028a7352809582db06000748eed1bd52321846 100644
--- a/services/std_svc/psci/psci_private.h
+++ b/services/std_svc/psci/psci_private.h
@@ -42,23 +42,12 @@
  * The following helper macros abstract the interface to the Bakery
  * Lock API.
  */
-#if USE_COHERENT_MEM
-#define psci_lock_init(non_cpu_pd_node, idx)	\
-	bakery_lock_init(&(non_cpu_pd_node)[(idx)].lock)
-#define psci_lock_get(non_cpu_pd_node)		\
-	bakery_lock_get(&((non_cpu_pd_node)->lock))
-#define psci_lock_release(non_cpu_pd_node)	\
-	bakery_lock_release(&((non_cpu_pd_node)->lock))
-#else
 #define psci_lock_init(non_cpu_pd_node, idx)			\
 	((non_cpu_pd_node)[(idx)].lock_index = (idx))
 #define psci_lock_get(non_cpu_pd_node)				\
-	bakery_lock_get((non_cpu_pd_node)->lock_index, 		\
-			CPU_DATA_PSCI_LOCK_OFFSET)
+	bakery_lock_get(&psci_locks[(non_cpu_pd_node)->lock_index])
 #define psci_lock_release(non_cpu_pd_node)			\
-	bakery_lock_release((non_cpu_pd_node)->lock_index,	\
-			     CPU_DATA_PSCI_LOCK_OFFSET)
-#endif
+	bakery_lock_release(&psci_locks[(non_cpu_pd_node)->lock_index])
 
 /*
  * The PSCI capability which are provided by the generic code but does not
@@ -140,12 +129,9 @@ typedef struct non_cpu_pwr_domain_node {
 	plat_local_state_t local_state;
 
 	unsigned char level;
-#if USE_COHERENT_MEM
-	bakery_lock_t lock;
-#else
-	/* For indexing the bakery_info array in per CPU data */
+
+	/* For indexing the psci_lock array*/
 	unsigned char lock_index;
-#endif
 } non_cpu_pd_node_t;
 
 typedef struct cpu_pwr_domain_node {
@@ -174,6 +160,9 @@ extern non_cpu_pd_node_t psci_non_cpu_pd_nodes[PSCI_NUM_NON_CPU_PWR_DOMAINS];
 extern cpu_pd_node_t psci_cpu_pd_nodes[PLATFORM_CORE_COUNT];
 extern unsigned int psci_caps;
 
+/* One bakery lock is required for each non-cpu power domain */
+DECLARE_BAKERY_LOCK(psci_locks[PSCI_NUM_NON_CPU_PWR_DOMAINS]);
+
 /*******************************************************************************
  * SPD's power management hooks registered with PSCI
  ******************************************************************************/
diff --git a/services/std_svc/psci/psci_setup.c b/services/std_svc/psci/psci_setup.c
index 94fe630c9928062ff4e2b4ea6cbd0a7332529c1e..7a80187398d07ba2afdbeda3dc5002856d7bc44d 100644
--- a/services/std_svc/psci/psci_setup.c
+++ b/services/std_svc/psci/psci_setup.c
@@ -181,12 +181,6 @@ static void populate_power_domain_tree(const unsigned char *topology)
 
 	/* Validate the sanity of array exported by the platform */
 	assert(j == PLATFORM_CORE_COUNT);
-
-#if !USE_COHERENT_MEM
-	/* Flush the non CPU power domain data to memory */
-	flush_dcache_range((uintptr_t) &psci_non_cpu_pd_nodes,
-			   sizeof(psci_non_cpu_pd_nodes));
-#endif
 }
 
 /*******************************************************************************