diff --git a/Makefile b/Makefile
index c82c7a20582ab3e486aa58b09758b62d1c5358a2..6ad4b4d7b605ea1e07065af83a1555d449263099 100644
--- a/Makefile
+++ b/Makefile
@@ -63,6 +63,8 @@ ARM_GIC_ARCH		:=	2
 # Flag used to indicate if ASM_ASSERTION should be enabled for the build.
 # This defaults to being present in DEBUG builds only.
 ASM_ASSERTION		:=	${DEBUG}
+# Build option to choose whether Trusted firmware uses Coherent memory or not.
+USE_COHERENT_MEM	:=	1
 # Default FIP file name
 FIP_NAME		:= fip.bin
 
@@ -230,6 +232,10 @@ $(eval $(call add_define,ASM_ASSERTION))
 # Process LOG_LEVEL flag
 $(eval $(call add_define,LOG_LEVEL))
 
+# Process USE_COHERENT_MEM flag
+$(eval $(call assert_boolean,USE_COHERENT_MEM))
+$(eval $(call add_define,USE_COHERENT_MEM))
+
 ASFLAGS			+= 	-nostdinc -ffreestanding -Wa,--fatal-warnings	\
 				-Werror -Wmissing-include-dirs			\
 				-mgeneral-regs-only -D__ASSEMBLY__		\
diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index 82330c11eb5a13acc9e25112975ecdb33cbfcbc2..cfc6292198d008f9bb737709aaad7e16f9e69b47 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -131,9 +131,11 @@ func bl1_entrypoint
 	ldr	x1, =__BSS_SIZE__
 	bl	zeromem16
 
+#if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
 	bl	zeromem16
+#endif
 
 	ldr	x0, =__DATA_RAM_START__
 	ldr	x1, =__DATA_ROM_START__
diff --git a/bl1/bl1.ld.S b/bl1/bl1.ld.S
index 007149b50c2087878b4bf58de27bbfd4fb883a59..d682384a6cb70fde87aa5d98e2a3ea683d9f18de 100644
--- a/bl1/bl1.ld.S
+++ b/bl1/bl1.ld.S
@@ -107,6 +107,7 @@ SECTIONS
         *(xlat_table)
     } >RAM
 
+#if USE_COHERENT_MEM
     /*
      * The base address of the coherent memory section must be page-aligned (4K)
      * to guarantee that the coherent data are stored on their own pages and
@@ -125,6 +126,7 @@ SECTIONS
         . = NEXT(4096);
         __COHERENT_RAM_END__ = .;
     } >RAM
+#endif
 
     __BL1_RAM_START__ = ADDR(.data);
     __BL1_RAM_END__ = .;
@@ -140,8 +142,10 @@ SECTIONS
 
     __BSS_SIZE__ = SIZEOF(.bss);
 
+#if USE_COHERENT_MEM
     __COHERENT_RAM_UNALIGNED_SIZE__ =
         __COHERENT_RAM_END_UNALIGNED__ - __COHERENT_RAM_START__;
+#endif
 
     ASSERT(. <= BL1_RW_LIMIT, "BL1's RW section has exceeded its limit.")
 }
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index 2f058da9fc9e2f586d315e39ed1d98eb575b0b2b..499dc3737ab2e0d9b78f30f1cce0114467836b2b 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -91,9 +91,11 @@ func bl2_entrypoint
 	ldr	x1, =__BSS_SIZE__
 	bl	zeromem16
 
+#if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
 	bl	zeromem16
+#endif
 
 	/* --------------------------------------------
 	 * Allocate a stack whose memory will be marked
diff --git a/bl2/bl2.ld.S b/bl2/bl2.ld.S
index 65304de71af5c75d6510d68cce5d3672ee7cfc6f..99333391b7a351f994bc143cd25937010d70968c 100644
--- a/bl2/bl2.ld.S
+++ b/bl2/bl2.ld.S
@@ -93,6 +93,7 @@ SECTIONS
         *(xlat_table)
     } >RAM
 
+#if USE_COHERENT_MEM
     /*
      * The base address of the coherent memory section must be page-aligned (4K)
      * to guarantee that the coherent data are stored on their own pages and
@@ -111,12 +112,16 @@ SECTIONS
         . = NEXT(4096);
         __COHERENT_RAM_END__ = .;
     } >RAM
+#endif
 
     __BL2_END__ = .;
 
     __BSS_SIZE__ = SIZEOF(.bss);
+
+#if USE_COHERENT_MEM
     __COHERENT_RAM_UNALIGNED_SIZE__ =
         __COHERENT_RAM_END_UNALIGNED__ - __COHERENT_RAM_START__;
+#endif
 
     ASSERT(. <= BL2_LIMIT, "BL2 image has exceeded its limit.")
 }
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 04063e1c67c0b23df4b85cc342c8a190de4e87cd..b786b29d8fc538724daaf4df36b7f2c8d4544d94 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -149,9 +149,11 @@ func bl31_entrypoint
 	ldr	x1, =__BSS_SIZE__
 	bl	zeromem16
 
+#if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
 	bl	zeromem16
+#endif
 
 	/* ---------------------------------------------
 	 * Initialize the cpu_ops pointer.
diff --git a/bl31/bl31.ld.S b/bl31/bl31.ld.S
index 124be85ca94824dae30522439b109383331b7988..3327f31653ee55399df12caf89dcf78806d652bb 100644
--- a/bl31/bl31.ld.S
+++ b/bl31/bl31.ld.S
@@ -117,6 +117,7 @@ SECTIONS
         *(xlat_table)
     } >RAM
 
+#if USE_COHERENT_MEM
     /*
      * The base address of the coherent memory section must be page-aligned (4K)
      * to guarantee that the coherent data are stored on their own pages and
@@ -135,12 +136,15 @@ SECTIONS
         . = NEXT(4096);
         __COHERENT_RAM_END__ = .;
     } >RAM
+#endif
 
     __BL31_END__ = .;
 
     __BSS_SIZE__ = SIZEOF(.bss);
+#if USE_COHERENT_MEM
     __COHERENT_RAM_UNALIGNED_SIZE__ =
         __COHERENT_RAM_END_UNALIGNED__ - __COHERENT_RAM_START__;
+#endif
 
     ASSERT(. <= BL31_LIMIT, "BL3-1 image has exceeded its limit.")
 }
diff --git a/bl31/bl31.mk b/bl31/bl31.mk
index f53a41ff7f49c4fec1a4e8fd47a29d737291824c..4c25a60a3ac4d419e4b01b9d8ef662bb698ab5af 100644
--- a/bl31/bl31.mk
+++ b/bl31/bl31.mk
@@ -40,7 +40,6 @@ BL31_SOURCES		+=	bl31/bl31_main.c				\
 				bl31/aarch64/runtime_exceptions.S		\
 				bl31/aarch64/crash_reporting.S			\
 				lib/cpus/aarch64/cpu_helpers.S			\
-				lib/locks/bakery/bakery_lock.c			\
 				lib/locks/exclusive/spinlock.S			\
 				services/std_svc/std_svc_setup.c		\
 				services/std_svc/psci/psci_afflvl_off.c		\
@@ -53,6 +52,12 @@ BL31_SOURCES		+=	bl31/bl31_main.c				\
 				services/std_svc/psci/psci_setup.c		\
 				services/std_svc/psci/psci_system_off.c
 
+ifeq (${USE_COHERENT_MEM}, 1)
+BL31_SOURCES		+=	lib/locks/bakery/bakery_lock_coherent.c
+else
+BL31_SOURCES		+=	lib/locks/bakery/bakery_lock_normal.c
+endif
+
 BL31_LINKERFILE		:=	bl31/bl31.ld.S
 
 # Flag used by the generic interrupt management framework to  determine if
diff --git a/bl32/tsp/aarch64/tsp_entrypoint.S b/bl32/tsp/aarch64/tsp_entrypoint.S
index 1cda1653aa45570240af81ce7557de417297d040..2714282beb8d780a49552a762f9633cef42d26bb 100644
--- a/bl32/tsp/aarch64/tsp_entrypoint.S
+++ b/bl32/tsp/aarch64/tsp_entrypoint.S
@@ -108,9 +108,11 @@ func tsp_entrypoint
 	ldr	x1, =__BSS_SIZE__
 	bl	zeromem16
 
+#if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
 	bl	zeromem16
+#endif
 
 	/* --------------------------------------------
 	 * Allocate a stack whose memory will be marked
diff --git a/bl32/tsp/tsp.ld.S b/bl32/tsp/tsp.ld.S
index 5d7ffa1193f91c3460fd92c00f3c5e0fee1cddb1..d411ad021219dfba6566abedfc2cc2874b38c928 100644
--- a/bl32/tsp/tsp.ld.S
+++ b/bl32/tsp/tsp.ld.S
@@ -98,6 +98,7 @@ SECTIONS
         *(xlat_table)
     } >RAM
 
+#if USE_COHERENT_MEM
     /*
      * The base address of the coherent memory section must be page-aligned (4K)
      * to guarantee that the coherent data are stored on their own pages and
@@ -116,12 +117,15 @@ SECTIONS
         . = NEXT(4096);
         __COHERENT_RAM_END__ = .;
     } >RAM
+#endif
 
     __BL32_END__ = .;
 
     __BSS_SIZE__ = SIZEOF(.bss);
+#if USE_COHERENT_MEM
     __COHERENT_RAM_UNALIGNED_SIZE__ =
         __COHERENT_RAM_END_UNALIGNED__ - __COHERENT_RAM_START__;
+#endif
 
     ASSERT(. <= BL32_LIMIT, "BL3-2 image has exceeded its limit.")
 }
diff --git a/bl32/tsp/tsp_main.c b/bl32/tsp/tsp_main.c
index 193ba29b855430acb0e65eef2b8e6fd0debe47cb..2eaca7c9aacd9a275d7d6c4ef9ba645c45f0a96c 100644
--- a/bl32/tsp/tsp_main.c
+++ b/bl32/tsp/tsp_main.c
@@ -43,7 +43,7 @@
  * of trusted SRAM
  ******************************************************************************/
 extern unsigned long __RO_START__;
-extern unsigned long __COHERENT_RAM_END__;
+extern unsigned long __BL32_END__;
 
 /*******************************************************************************
  * Lock to control access to the console
@@ -63,11 +63,11 @@ work_statistics_t tsp_stats[PLATFORM_CORE_COUNT];
 
 /*******************************************************************************
  * The BL32 memory footprint starts with an RO sections and ends
- * with a section for coherent RAM. Use it to find the memory size
+ * with the linker symbol __BL32_END__. Use it to find the memory size
  ******************************************************************************/
 #define BL32_TOTAL_BASE (unsigned long)(&__RO_START__)
 
-#define BL32_TOTAL_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#define BL32_TOTAL_LIMIT (unsigned long)(&__BL32_END__)
 
 static tsp_args_t *set_smc_args(uint64_t arg0,
 			     uint64_t arg1,
diff --git a/docs/firmware-design.md b/docs/firmware-design.md
index 41aaf7f2dc5c76d63414adb542bb086f4c395073..774ea436cf566ba8eb2cbd52b645ec130da4ae7e 100644
--- a/docs/firmware-design.md
+++ b/docs/firmware-design.md
@@ -12,8 +12,9 @@ Contents :
 7.  [CPU specific operations framework](#7--cpu-specific-operations-framework)
 8.  [Memory layout of BL images](#8-memory-layout-of-bl-images)
 9.  [Firmware Image Package (FIP)](#9--firmware-image-package-fip)
-10. [Code Structure](#10--code-structure)
-11. [References](#11--references)
+10. [Use of coherent memory in Trusted Firmware](#10--use-of-coherent-memory-in-trusted-firmware)
+11. [Code Structure](#11--code-structure)
+12. [References](#12--references)
 
 
 1.  Introduction
@@ -368,10 +369,10 @@ level implementation of the generic timer through the memory mapped interface.
     `ON`; any other cluster is `OFF`. BL3-1 initializes the data structures that
     implement the state machine, including the locks that protect them. BL3-1
     accesses the state of a CPU or cluster immediately after reset and before
-    the MMU is enabled in the warm boot path. It is not currently possible to
-    use 'exclusive' based spinlocks, therefore BL3-1 uses locks based on
-    Lamport's Bakery algorithm instead. BL3-1 allocates these locks in device
-    memory. They are accessible irrespective of MMU state.
+    the data cache is enabled in the warm boot path. It is not currently
+    possible to use 'exclusive' based spinlocks, therefore BL3-1 uses locks
+    based on Lamport's Bakery algorithm instead. BL3-1 allocates these locks in
+    device memory by default.
 
 *   Runtime services initialization:
 
@@ -1127,9 +1128,10 @@ this purpose:
 * `__BSS_START__` This address must be aligned on a 16-byte boundary.
 * `__BSS_SIZE__`
 
-Similarly, the coherent memory section must be zero-initialised. Also, the MMU
-setup code needs to know the extents of this section to set the right memory
-attributes for it. The following linker symbols are defined for this purpose:
+Similarly, the coherent memory section (if enabled) must be zero-initialised.
+Also, the MMU setup code needs to know the extents of this section to set the
+right memory attributes for it. The following linker symbols are defined for
+this purpose:
 
 * `__COHERENT_RAM_START__` This address must be aligned on a page-size boundary.
 * `__COHERENT_RAM_END__` This address must be aligned on a page-size boundary.
@@ -1443,7 +1445,208 @@ Currently the FVP's policy only allows loading of a known set of images. The
 platform policy can be modified to allow additional images.
 
 
-10.  Code Structure
+10. Use of coherent memory in Trusted Firmware
+----------------------------------------------
+
+There might be loss of coherency when physical memory with mismatched
+shareability, cacheability and memory attributes is accessed by multiple CPUs
+(refer to section B2.9 of [ARM ARM] for more details). This possibility occurs
+in Trusted Firmware during power up/down sequences when coherency, MMU and
+caches are turned on/off incrementally.
+
+Trusted Firmware defines coherent memory as a region of memory with Device
+nGnRE attributes in the translation tables. The translation granule size in
+Trusted Firmware is 4KB. This is the smallest possible size of the coherent
+memory region.
+
+By default, all data structures which are susceptible to accesses with
+mismatched attributes from various CPUs are allocated in a coherent memory
+region (refer to section 2.1 of [Porting Guide]). The coherent memory region
+accesses are Outer Shareable, non-cacheable and they can be accessed
+with the Device nGnRE attributes when the MMU is turned on. Hence, at the
+expense of at least an extra page of memory, Trusted Firmware is able to work
+around coherency issues due to mismatched memory attributes.
+
+The alternative to the above approach is to allocate the susceptible data
+structures in Normal WriteBack WriteAllocate Inner shareable memory. This
+approach requires the data structures to be designed so that it is possible to
+work around the issue of mismatched memory attributes by performing software
+cache maintenance on them.
+
+### Disabling the use of coherent memory in Trusted Firmware
+
+It might be desirable to avoid the cost of allocating coherent memory on
+platforms which are memory constrained. Trusted Firmware enables inclusion of
+coherent memory in firmware images through the build flag `USE_COHERENT_MEM`.
+This flag is enabled by default. It can be disabled to choose the second
+approach described above.
+
+The sections below analyze the data structures allocated in the coherent memory
+region and the changes required to allocate them in normal memory.
+
+### PSCI Affinity map nodes
+
+The `psci_aff_map` data structure stores the hierarchical node information for
+each affinity level in the system including the PSCI states associated with them.
+By default, this data structure is allocated in the coherent memory region in
+the Trusted Firmware because it can be accessed by multiple CPUs, either with
+their caches enabled or disabled.
+
+	typedef struct aff_map_node {
+		unsigned long mpidr;
+		unsigned char ref_count;
+		unsigned char state;
+		unsigned char level;
+	#if USE_COHERENT_MEM
+		bakery_lock_t lock;
+	#else
+		unsigned char aff_map_index;
+	#endif
+	} aff_map_node_t;
+
+In order to move this data structure to normal memory, the use of each of its
+fields must be analyzed. Fields like `mpidr` and `level` are only written once
+during cold boot. Hence removing them from coherent memory involves only doing
+a clean and invalidate of the cache lines after these fields are written.
+
+The fields `state` and `ref_count` can be concurrently accessed by multiple
+CPUs in different cache states. A Lamport's Bakery lock is used to ensure mutual
+exclusion to these fields. As a result, it is possible to move these fields out
+of coherent memory by performing software cache maintenance on them. The field
+`lock` is the bakery lock data structure when `USE_COHERENT_MEM` is enabled.
+The `aff_map_index` is used to identify the bakery lock when `USE_COHERENT_MEM`
+is disabled.
+
+### Bakery lock data
+
+The bakery lock data structure `bakery_lock_t` is allocated in coherent memory
+and is accessed by multiple CPUs with mismatched attributes. `bakery_lock_t` is
+defined as follows:
+
+    typedef struct bakery_lock {
+        int owner;
+        volatile char entering[BAKERY_LOCK_MAX_CPUS];
+        volatile unsigned number[BAKERY_LOCK_MAX_CPUS];
+    } bakery_lock_t;
+
+It is a characteristic of Lamport's Bakery algorithm that the volatile per-CPU
+fields can be read by all CPUs but only written to by the owning CPU.
+
+Depending upon the data cache line size, the per-CPU fields of the
+`bakery_lock_t` structure for multiple CPUs may exist on a single cache line.
+These per-CPU fields can be read and written during lock contention by multiple
+CPUs with mismatched memory attributes. Since these fields are a part of the
+lock implementation, they do not have access to any other locking primitive to
+safeguard against the resulting coherency issues. As a result, simple software
+cache maintenance is not enough to allocate them in normal memory. Consider
+the following example.
+
+CPU0 updates its per-CPU field with data cache enabled. This write updates a
+local cache line which contains a copy of the fields for other CPUs as well. Now
+CPU1 updates its per-CPU field of the `bakery_lock_t` structure with data cache
+disabled. CPU1 then issues a DCIVAC operation to invalidate any stale copies of
+its field in any other cache line in the system. This operation will invalidate
+the update made by CPU0 as well.
+
+To use bakery locks when `USE_COHERENT_MEM` is disabled, the lock data structure
+has been redesigned. The changes utilise the characteristic of Lamport's Bakery
+algorithm mentioned earlier. The per-CPU fields of the new lock structure are
+aligned such that they are allocated on separate cache lines. The per-CPU data
+framework in Trusted Firmware is used to achieve this. This enables software to
+perform software cache maintenance on the lock data structure without running
+into coherency issues associated with mismatched attributes.
+
+The per-CPU data framework enables consolidation of data structures on the
+fewest cache lines possible. This saves memory as compared to the scenario where
+each data structure is separately aligned to the cache line boundary to achieve
+the same effect.
+
+The bakery lock data structure `bakery_info_t` is defined for use when
+`USE_COHERENT_MEM` is disabled as follows:
+
+    typedef struct bakery_info {
+        /*
+         * The lock_data is a bit-field of 2 members:
+         * Bit[0]       : choosing. This field is set when the CPU is
+         *                choosing its bakery number.
+         * Bits[1 - 15] : number. This is the bakery number allocated.
+         */
+         volatile uint16_t lock_data;
+    } bakery_info_t;
+
+The `bakery_info_t` represents a single per-CPU field of one lock and
+the combination of corresponding `bakery_info_t` structures for all CPUs in the
+system represents the complete bakery lock. It is embedded in the per-CPU
+data framework `cpu_data` as shown below:
+
+      CPU0 cpu_data
+    ------------------
+    | ....           |
+    |----------------|
+    | `bakery_info_t`| <-- Lock_0 per-CPU field
+    |    Lock_0      |     for CPU0
+    |----------------|
+    | `bakery_info_t`| <-- Lock_1 per-CPU field
+    |    Lock_1      |     for CPU0
+    |----------------|
+    | ....           |
+    |----------------|
+    | `bakery_info_t`| <-- Lock_N per-CPU field
+    |    Lock_N      |     for CPU0
+    ------------------
+
+
+      CPU1 cpu_data
+    ------------------
+    | ....           |
+    |----------------|
+    | `bakery_info_t`| <-- Lock_0 per-CPU field
+    |    Lock_0      |     for CPU1
+    |----------------|
+    | `bakery_info_t`| <-- Lock_1 per-CPU field
+    |    Lock_1      |     for CPU1
+    |----------------|
+    | ....           |
+    |----------------|
+    | `bakery_info_t`| <-- Lock_N per-CPU field
+    |    Lock_N      |     for CPU1
+    ------------------
+
+Consider a system of 2 CPUs with 'N' bakery locks as shown above.  For an
+operation on Lock_N, the corresponding `bakery_info_t` in both CPU0 and CPU1
+`cpu_data` need to be fetched and appropriate cache operations need to be
+performed for each access.
+
+For multiple bakery locks, an array of `bakery_info_t` is declared in `cpu_data`
+and each lock is given an `id` to identify it in the array.
+
+### Non Functional Impact of removing coherent memory
+
+Removal of the coherent memory region leads to the additional software overhead
+of performing cache maintenance for the affected data structures. However, since
+the memory where the data structures are allocated is cacheable, the overhead is
+mostly mitigated by an increase in performance.
+
+There is however a performance impact for bakery locks, due to:
+*   Additional cache maintenance operations, and
+*   Multiple cache line reads for each lock operation, since the bakery locks
+    for each CPU are distributed across different cache lines.
+
+The implementation has been optimized to minimize this additional overhead.
+Measurements indicate that when bakery locks are allocated in Normal memory, the
+minimum latency of acquiring a lock is on average 3-4 microseconds whereas
+in Device memory the same is 2 microseconds. The measurements were done on the
+Juno ARM development platform.
+
+As mentioned earlier, almost a page of memory can be saved by disabling
+`USE_COHERENT_MEM`. Each platform needs to consider these trade-offs to decide
+whether coherent memory should be used. If a platform disables
+`USE_COHERENT_MEM` and needs to use bakery locks in the porting layer, it should
+reserve memory in `cpu_data` by defining the macro `PLAT_PCPU_DATA_SIZE` (see
+the [Porting Guide]). Refer to the reference platform code for examples.
+
+
+11.  Code Structure
 -------------------
 
 Trusted Firmware code is logically divided between the three boot loader
@@ -1488,7 +1691,7 @@ FDTs provide a description of the hardware platform and are used by the Linux
 kernel at boot time. These can be found in the `fdts` directory.
 
 
-11.  References
+12.  References
 ---------------
 
 1.  Trusted Board Boot Requirements CLIENT PDD (ARM DEN 0006B-5). Available
@@ -1504,7 +1707,7 @@ kernel at boot time. These can be found in the `fdts` directory.
 
 _Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved._
 
-
+[ARM ARM]:          http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.e/index.html "ARMv8-A Reference Manual (ARM DDI0487A.E)"
 [PSCI]:             http://infocenter.arm.com/help/topic/com.arm.doc.den0022b/index.html "Power State Coordination Interface PDD (ARM DEN 0022B.b)"
 [SMCCC]:            http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html "SMC Calling Convention PDD (ARM DEN 0028A)"
 [UUID]:             https://tools.ietf.org/rfc/rfc4122.txt "A Universally Unique IDentifier (UUID) URN Namespace"
diff --git a/docs/porting-guide.md b/docs/porting-guide.md
index 3855ca7b03d63c16b4ce2139a1bd7746388fa91f..3d5e66fb53d15e1ce074f7d363bf00fee3cb66a6 100644
--- a/docs/porting-guide.md
+++ b/docs/porting-guide.md
@@ -63,11 +63,11 @@ mapped page tables, and enable both the instruction and data caches for each BL
 stage. In the ARM FVP port, each BL stage configures the MMU in its platform-
 specific architecture setup function, for example `blX_plat_arch_setup()`.
 
-Each platform must allocate a block of identity mapped secure memory with
-Device-nGnRE attributes aligned to page boundary (4K) for each BL stage. This
-memory is identified by the section name `tzfw_coherent_mem` so that its
-possible for the firmware to place variables in it using the following C code
-directive:
+If the build option `USE_COHERENT_MEM` is enabled, each platform must allocate a
+block of identity mapped secure memory with Device-nGnRE attributes aligned to
+page boundary (4K) for each BL stage. This memory is identified by the section
+name `tzfw_coherent_mem` so that it is possible for the firmware to place
+variables in it using the following C code directive:
 
     __attribute__ ((section("tzfw_coherent_mem")))
 
@@ -246,6 +246,17 @@ must also be defined:
     entities than this value using `io_open()` will fail with
     IO_RESOURCES_EXHAUSTED.
 
+If the platform needs to allocate data within the per-cpu data framework in
+BL3-1, it should define the following macro. Currently this is only required if
+the platform decides not to use the coherent memory section by setting the
+`USE_COHERENT_MEM` build flag to 0. In this case, the framework allocates the
+required memory within the per-cpu data to minimize wastage.
+
+*   **#define : PLAT_PCPU_DATA_SIZE**
+
+    Defines the memory (in bytes) to be reserved within the per-cpu data
+    structure for use by the platform layer.
+
 The following constants are optional. They should be defined when the platform
 memory layout implies some image overlaying like on FVP.
 
diff --git a/docs/user-guide.md b/docs/user-guide.md
index b33c4c0c3a308e081951685f6b757dae89ba5556..5ad44a898abab679773c65aef601c71d95dea62f 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -245,6 +245,12 @@ performed.
     synchronous method) or 1 (BL3-2 is initialized using asynchronous method).
     Default is 0.
 
+*   `USE_COHERENT_MEM`: This flag determines whether to include the coherent
+    memory region in the BL memory map or not (see "Use of coherent memory in
+    Trusted Firmware" section in [Firmware Design]). It can take the value 1
+    (Coherent memory region is included) or 0 (Coherent memory region is
+    excluded). Default is 1.
+
 #### FVP specific build options
 
 *   `FVP_TSP_RAM_LOCATION`: location of the TSP binary. Options:
diff --git a/include/bl31/cpu_data.h b/include/bl31/cpu_data.h
index c886e2b4ed69f5769cae5055cb0361baed4d2488..1926e292a9752b15547238f4c08ecd75555e5ee0 100644
--- a/include/bl31/cpu_data.h
+++ b/include/bl31/cpu_data.h
@@ -32,7 +32,7 @@
 #define __CPU_DATA_H__
 
 /* Offsets for the cpu_data structure */
-#define CPU_DATA_CRASH_BUF_OFFSET	0x20
+#define CPU_DATA_CRASH_BUF_OFFSET	0x18
 #if CRASH_REPORTING
 #define CPU_DATA_LOG2SIZE		7
 #else
@@ -45,10 +45,20 @@
 #ifndef __ASSEMBLY__
 
 #include <arch_helpers.h>
+#include <cassert.h>
 #include <platform_def.h>
 #include <psci.h>
 #include <stdint.h>
 
+/* Offsets for the cpu_data structure */
+#define CPU_DATA_PSCI_LOCK_OFFSET	__builtin_offsetof\
+		(cpu_data_t, psci_svc_cpu_data.pcpu_bakery_info)
+
+#if PLAT_PCPU_DATA_SIZE
+#define CPU_DATA_PLAT_PCPU_OFFSET	__builtin_offsetof\
+		(cpu_data_t, platform_cpu_data)
+#endif
+
 /*******************************************************************************
  * Function & variable prototypes
  ******************************************************************************/
@@ -69,9 +79,12 @@
 typedef struct cpu_data {
 	void *cpu_context[2];
 	uint64_t cpu_ops_ptr;
-	struct psci_cpu_data psci_svc_cpu_data;
 #if CRASH_REPORTING
 	uint64_t crash_buf[CPU_DATA_CRASH_BUF_SIZE >> 3];
+#endif
+	struct psci_cpu_data psci_svc_cpu_data;
+#if PLAT_PCPU_DATA_SIZE
+	uint8_t platform_cpu_data[PLAT_PCPU_DATA_SIZE];
 #endif
 } __aligned(CACHE_WRITEBACK_GRANULE) cpu_data_t;
 
diff --git a/include/bl31/services/psci.h b/include/bl31/services/psci.h
index 6c23f1bf878d100df92c304e31a1d0d3a0a0769a..dc6cc04c46e827fd540eb187fd9a9d9df274903b 100644
--- a/include/bl31/services/psci.h
+++ b/include/bl31/services/psci.h
@@ -31,6 +31,17 @@
 #ifndef __PSCI_H__
 #define __PSCI_H__
 
+#include <bakery_lock.h>
+#include <platform_def.h>	/* for PLATFORM_NUM_AFFS */
+
+/*******************************************************************************
+ * Number of affinity instances whose state this psci imp. can track
+ ******************************************************************************/
+#ifdef PLATFORM_NUM_AFFS
+#define PSCI_NUM_AFFS		PLATFORM_NUM_AFFS
+#else
+#define PSCI_NUM_AFFS		(2 * PLATFORM_CORE_COUNT)
+#endif
 
 /*******************************************************************************
  * Defines for runtime services func ids
@@ -140,6 +151,9 @@ typedef struct psci_cpu_data {
 	uint32_t power_state;
 	uint32_t max_phys_off_afflvl;	/* Highest affinity level in physically
 					   powered off state */
+#if !USE_COHERENT_MEM
+	bakery_info_t pcpu_bakery_info[PSCI_NUM_AFFS];
+#endif
 } psci_cpu_data_t;
 
 /*******************************************************************************
diff --git a/include/lib/aarch64/arch_helpers.h b/include/lib/aarch64/arch_helpers.h
index 7320a0af43b7268c3f442eaa56b003f44ba5e01f..7d24a5378351d60b8fe0f5bbe91e4d7de5964585 100644
--- a/include/lib/aarch64/arch_helpers.h
+++ b/include/lib/aarch64/arch_helpers.h
@@ -175,6 +175,9 @@ DEFINE_SYSOP_FUNC(wfi)
 DEFINE_SYSOP_FUNC(wfe)
 DEFINE_SYSOP_FUNC(sev)
 DEFINE_SYSOP_TYPE_FUNC(dsb, sy)
+DEFINE_SYSOP_TYPE_FUNC(dmb, sy)
+DEFINE_SYSOP_TYPE_FUNC(dsb, ish)
+DEFINE_SYSOP_TYPE_FUNC(dmb, ish)
 DEFINE_SYSOP_FUNC(isb)
 
 uint32_t get_afflvl_shift(uint32_t);
diff --git a/include/lib/bakery_lock.h b/include/lib/bakery_lock.h
index 95634cf5480dba6949f141e16cabe080f3351bc2..9736f850accdfd5c54bb8afa851a178dec524f32 100644
--- a/include/lib/bakery_lock.h
+++ b/include/lib/bakery_lock.h
@@ -35,6 +35,11 @@
 
 #define BAKERY_LOCK_MAX_CPUS		PLATFORM_CORE_COUNT
 
+#ifndef __ASSEMBLY__
+#include <stdint.h>
+
+#if USE_COHERENT_MEM
+
 typedef struct bakery_lock {
 	int owner;
 	volatile char entering[BAKERY_LOCK_MAX_CPUS];
@@ -48,4 +53,21 @@ void bakery_lock_get(bakery_lock_t *bakery);
 void bakery_lock_release(bakery_lock_t *bakery);
 int bakery_lock_try(bakery_lock_t *bakery);
 
+#else
+
+typedef struct bakery_info {
+	/*
+	 * The lock_data is a bit-field of 2 members:
+	 * Bit[0]       : choosing. This field is set when the CPU is
+	 *                choosing its bakery number.
+	 * Bits[1 - 15] : number. This is the bakery number allocated.
+	 */
+	volatile uint16_t lock_data;
+} bakery_info_t;
+
+void bakery_lock_get(unsigned int id, unsigned int offset);
+void bakery_lock_release(unsigned int id, unsigned int offset);
+
+#endif /* USE_COHERENT_MEM */
+#endif /* __ASSEMBLY__ */
 #endif /* __BAKERY_LOCK_H__ */
diff --git a/lib/locks/bakery/bakery_lock.c b/lib/locks/bakery/bakery_lock_coherent.c
similarity index 98%
rename from lib/locks/bakery/bakery_lock.c
rename to lib/locks/bakery/bakery_lock_coherent.c
index 7e71dec0f96656869d3b19be516cf5c8dff4fa36..5d538ce2c6afc301ed9d0ec6f92c2121f0ced92d 100644
--- a/lib/locks/bakery/bakery_lock.c
+++ b/lib/locks/bakery/bakery_lock_coherent.c
@@ -31,11 +31,13 @@
 #include <arch_helpers.h>
 #include <assert.h>
 #include <bakery_lock.h>
+#include <cpu_data.h>
 #include <platform.h>
 #include <string.h>
 
 /*
- * Functions in this file implement Bakery Algorithm for mutual exclusion.
+ * Functions in this file implement Bakery Algorithm for mutual exclusion with the
+ * bakery lock data structures in coherent memory.
  *
  * ARM architecture offers a family of exclusive access instructions to
  * efficiently implement mutual exclusion with hardware support. However, as
@@ -107,8 +109,6 @@ static unsigned int bakery_get_ticket(bakery_lock_t *bakery, unsigned int me)
 	++my_ticket;
 	bakery->number[me] = my_ticket;
 	bakery->entering[me] = 0;
-	dsb();
-	sev();
 
 	return my_ticket;
 }
@@ -151,7 +151,7 @@ void bakery_lock_get(bakery_lock_t *bakery)
 
 		/* Wait for the contender to get their ticket */
 		while (bakery->entering[they])
-			wfe();
+			;
 
 		/*
 		 * If the other party is a contender, they'll have non-zero
diff --git a/lib/locks/bakery/bakery_lock_normal.c b/lib/locks/bakery/bakery_lock_normal.c
new file mode 100644
index 0000000000000000000000000000000000000000..a325fd4feb2716ceff07c2ae9f2a39c948218b11
--- /dev/null
+++ b/lib/locks/bakery/bakery_lock_normal.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of ARM nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arch_helpers.h>
+#include <assert.h>
+#include <bakery_lock.h>
+#include <cpu_data.h>
+#include <platform.h>
+#include <string.h>
+
+/*
+ * Functions in this file implement Bakery Algorithm for mutual exclusion with the
+ * bakery lock data structures in cacheable and Normal memory.
+ *
+ * ARM architecture offers a family of exclusive access instructions to
+ * efficiently implement mutual exclusion with hardware support. However, as
+ * well as depending on external hardware, these instructions have defined
+ * behavior only on certain memory types (cacheable and Normal memory in
+ * particular; see ARMv8 Architecture Reference Manual section B2.10). Use cases
+ * in trusted firmware are such that mutual exclusion implementation cannot
+ * expect that accesses to the lock have the specific type required by the
+ * architecture for these primitives to function (for example, not all
+ * contenders may have address translation enabled).
+ *
+ * This implementation does not use mutual exclusion primitives. It expects
+ * memory regions where the locks reside to be cacheable and Normal.
+ *
+ * Note that the ARM architecture guarantees single-copy atomicity for aligned
+ * accesses regardless of status of address translation.
+ */
+
+/* Convert a ticket to priority */
+#define PRIORITY(t, pos)	(((t) << 8) | (pos))
+
+#define CHOOSING_TICKET		0x1
+#define CHOOSING_DONE		0x0
+
+#define bakery_is_choosing(info)	((info) & 0x1)
+#define bakery_ticket_number(info)	(((info) >> 1) & 0x7FFF)
+#define make_bakery_data(choosing, number) \
+		((((choosing) & 0x1) | ((number) << 1)) & 0xFFFF)
+
+/* This macro assumes that the bakery_info array is located at the offset specified */
+#define get_my_bakery_info(offset, id)		\
+	(((bakery_info_t *) (((uint8_t *)_cpu_data()) + (offset))) + (id))
+
+#define get_bakery_info_by_index(offset, id, ix)	\
+	(((bakery_info_t *) (((uint8_t *)_cpu_data_by_index(ix)) + (offset))) + (id))
+
+/* Cache maintenance helpers; each expands to exactly one statement */
+#define write_cache_op(addr, cached)	do {	\
+					((cached) ? dccvac((uint64_t)(addr)) :\
+						dcivac((uint64_t)(addr)));\
+					dsbish();\
+				} while (0)
+
+#define read_cache_op(addr, cached)	do { if (cached) \
+			dccivac((uint64_t)(addr)); } while (0)
+
+/*
+ * Take a ticket for bakery lock 'id' on behalf of cpu 'me'. 'offset' locates
+ * the bakery_info array inside each cpu's per-cpu data and 'is_cached' selects
+ * the cache maintenance applied around every lock_data access. Returns the
+ * new ticket: one more than the highest ticket seen among the other cpus.
+ */
+static unsigned int bakery_get_ticket(unsigned int id, unsigned int offset,
+				unsigned int me, unsigned int is_cached)
+{
+	unsigned int my_ticket, their_ticket;
+	unsigned int they;
+	bakery_info_t *my_bakery_info, *their_bakery_info;
+
+	/* Locate this cpu's bakery information and ensure it is not NULL */
+	my_bakery_info = get_my_bakery_info(offset, id);
+	assert(my_bakery_info);
+
+	/*
+	 * Tell other contenders that we are through the bakery doorway i.e.
+	 * going to allocate a ticket for this cpu.
+	 */
+	my_ticket = 0;
+	my_bakery_info->lock_data = make_bakery_data(CHOOSING_TICKET, my_ticket);
+
+	write_cache_op(my_bakery_info, is_cached);
+
+	/*
+	 * Iterate through the bakery information of each contender to allocate
+	 * the highest ticket number for this cpu.
+	 */
+	for (they = 0; they < BAKERY_LOCK_MAX_CPUS; they++) {
+		if (me == they)
+			continue;
+
+		/*
+		 * Get a reference to the other contender's bakery info and
+		 * ensure that a stale copy is not read.
+		 */
+		their_bakery_info = get_bakery_info_by_index(offset, id, they);
+		assert(their_bakery_info);
+
+		read_cache_op(their_bakery_info, is_cached);
+
+		/* Remember the highest ticket number seen so far */
+		their_ticket = bakery_ticket_number(their_bakery_info->lock_data);
+		if (their_ticket > my_ticket)
+			my_ticket = their_ticket;
+	}
+
+	/*
+	 * Compute ticket; then signal to other contenders waiting for us to
+	 * finish calculating our ticket value that we're done
+	 */
+	++my_ticket;
+	my_bakery_info->lock_data = make_bakery_data(CHOOSING_DONE, my_ticket);
+
+	write_cache_op(my_bakery_info, is_cached);
+
+	return my_ticket;
+}
+
+/*
+ * Acquire bakery lock 'id' for the calling cpu. 'offset' is the byte offset of
+ * the bakery_info array within each cpu's per-cpu data. For every other cpu,
+ * wait until it has finished choosing and, if it holds a higher-priority
+ * ticket (lower PRIORITY() value), until that ticket value changes.
+ */
+void bakery_lock_get(unsigned int id, unsigned int offset)
+{
+	unsigned int they, me, is_cached;
+	unsigned int my_ticket, my_prio, their_ticket;
+	bakery_info_t *their_bakery_info;
+	uint16_t their_bakery_data;
+
+	me = platform_get_core_pos(read_mpidr_el1());
+
+	is_cached = read_sctlr_el3() & SCTLR_C_BIT;
+
+	/* Get a ticket */
+	my_ticket = bakery_get_ticket(id, offset, me, is_cached);
+
+	/* Compute our priority from the ticket, then compare with the others */
+	my_prio = PRIORITY(my_ticket, me);
+	for (they = 0; they < BAKERY_LOCK_MAX_CPUS; they++) {
+		if (me == they)
+			continue;
+
+		/* Fetch the other contender's bakery info, avoiding a stale copy */
+		their_bakery_info = get_bakery_info_by_index(offset, id, they);
+		assert(their_bakery_info);
+		read_cache_op(their_bakery_info, is_cached);
+
+		their_bakery_data = their_bakery_info->lock_data;
+
+		/* Wait for the contender to get their ticket */
+		while (bakery_is_choosing(their_bakery_data)) {
+			read_cache_op(their_bakery_info, is_cached);
+			their_bakery_data = their_bakery_info->lock_data;
+		}
+
+		/*
+		 * If the other party is a contender, they'll have non-zero
+		 * (valid) ticket value. If they do, compare priorities
+		 */
+		their_ticket = bakery_ticket_number(their_bakery_data);
+		if (their_ticket && (PRIORITY(their_ticket, they) < my_prio)) {
+			/*
+			 * They have higher priority (lower value). Wait for
+			 * their ticket value to change (either release the lock
+			 * to have it dropped to 0; or drop and probably contend
+			 * again for the same lock to have an even higher value)
+			 */
+			do {
+				wfe();
+				read_cache_op(their_bakery_info, is_cached);
+			} while (their_ticket
+				== bakery_ticket_number(their_bakery_info->lock_data));
+		}
+	}
+}
+
+/* Reset this cpu's lock_data for lock 'id' to 0, then signal an event (sev) */
+void bakery_lock_release(unsigned int id, unsigned int offset)
+{
+	unsigned int caches_on = read_sctlr_el3() & SCTLR_C_BIT;
+	bakery_info_t *own_info = get_my_bakery_info(offset, id);
+
+	own_info->lock_data = 0;
+	write_cache_op(own_info, caches_on);
+	sev();
+}
diff --git a/plat/fvp/aarch64/fvp_common.c b/plat/fvp/aarch64/fvp_common.c
index 987f48f6ae3135fa3bb9a80903ca5497798cd971..e20fe7d900131c64c59e4484104ee49f713c1bed 100644
--- a/plat/fvp/aarch64/fvp_common.c
+++ b/plat/fvp/aarch64/fvp_common.c
@@ -136,7 +136,8 @@ const unsigned int num_sec_irqs = sizeof(irq_sec_array) /
  * Macro generating the code for the function setting up the pagetables as per
  * the platform memory map & initialize the mmu, for the given exception level
  ******************************************************************************/
-#define DEFINE_CONFIGURE_MMU_EL(_el)					\
+#if USE_COHERENT_MEM
+#define DEFINE_CONFIGURE_MMU_EL(_el)				\
 	void fvp_configure_mmu_el##_el(unsigned long total_base,	\
 				   unsigned long total_size,		\
 				   unsigned long ro_start,		\
@@ -158,6 +159,25 @@ const unsigned int num_sec_irqs = sizeof(irq_sec_array) /
 									\
 		enable_mmu_el##_el(0);					\
 	}
+#else
+#define DEFINE_CONFIGURE_MMU_EL(_el)				\
+	void fvp_configure_mmu_el##_el(unsigned long total_base,	\
+				   unsigned long total_size,		\
+				   unsigned long ro_start,		\
+				   unsigned long ro_limit)		\
+	{								\
+		mmap_add_region(total_base, total_base,			\
+				total_size,				\
+				MT_MEMORY | MT_RW | MT_SECURE);		\
+		mmap_add_region(ro_start, ro_start,			\
+				ro_limit - ro_start,			\
+				MT_MEMORY | MT_RO | MT_SECURE);		\
+		mmap_add(fvp_mmap);					\
+		init_xlat_tables();					\
+									\
+		enable_mmu_el##_el(0);					\
+	}
+#endif
 
 /* Define EL1 and EL3 variants of the function initialising the MMU */
 DEFINE_CONFIGURE_MMU_EL(1)
diff --git a/plat/fvp/bl1_fvp_setup.c b/plat/fvp/bl1_fvp_setup.c
index b1205d43529dda27ae6390088ea4bbaa6f62e873..4b421d71ab776794f670cc2dcd613d31bb5aaf56 100644
--- a/plat/fvp/bl1_fvp_setup.c
+++ b/plat/fvp/bl1_fvp_setup.c
@@ -40,6 +40,7 @@
 #include "fvp_def.h"
 #include "fvp_private.h"
 
+#if USE_COHERENT_MEM
 /*******************************************************************************
  * Declarations of linker defined symbols which will help us find the layout
  * of trusted SRAM
@@ -56,6 +57,7 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL1_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL1_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#endif
 
 /* Data structure which holds the extents of the trusted SRAM for BL1*/
 static meminfo_t bl1_tzram_layout;
@@ -116,9 +118,12 @@ void bl1_plat_arch_setup(void)
 	fvp_configure_mmu_el3(bl1_tzram_layout.total_base,
 			      bl1_tzram_layout.total_size,
 			      BL1_RO_BASE,
-			      BL1_RO_LIMIT,
-			      BL1_COHERENT_RAM_BASE,
-			      BL1_COHERENT_RAM_LIMIT);
+			      BL1_RO_LIMIT
+#if USE_COHERENT_MEM
+			      , BL1_COHERENT_RAM_BASE,
+			      BL1_COHERENT_RAM_LIMIT
+#endif
+			     );
 }
 
 
diff --git a/plat/fvp/bl2_fvp_setup.c b/plat/fvp/bl2_fvp_setup.c
index 67f89bc4702ad66503172fcdc896e32f284cbcd0..71bd8c2a40051daa47a7efd2bf1897d0c01e5cdf 100644
--- a/plat/fvp/bl2_fvp_setup.c
+++ b/plat/fvp/bl2_fvp_setup.c
@@ -45,8 +45,10 @@
 extern unsigned long __RO_START__;
 extern unsigned long __RO_END__;
 
+#if USE_COHERENT_MEM
 extern unsigned long __COHERENT_RAM_START__;
 extern unsigned long __COHERENT_RAM_END__;
+#endif
 
 /*
  * The next 2 constants identify the extents of the code & RO data region.
@@ -57,6 +59,7 @@ extern unsigned long __COHERENT_RAM_END__;
 #define BL2_RO_BASE (unsigned long)(&__RO_START__)
 #define BL2_RO_LIMIT (unsigned long)(&__RO_END__)
 
+#if USE_COHERENT_MEM
 /*
  * The next 2 constants identify the extents of the coherent memory region.
  * These addresses are used by the MMU setup code and therefore they must be
@@ -66,11 +69,11 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL2_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL2_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#endif
 
 /* Data structure which holds the extents of the trusted SRAM for BL2 */
 static meminfo_t bl2_tzram_layout
-__attribute__ ((aligned(PLATFORM_CACHE_LINE_SIZE),
-		section("tzfw_coherent_mem")));
+__attribute__ ((aligned(PLATFORM_CACHE_LINE_SIZE)));
 
 /* Assert that BL3-1 parameters fit in shared memory */
 CASSERT((PARAMS_BASE + sizeof(bl2_to_bl31_params_mem_t)) <
@@ -209,9 +212,12 @@ void bl2_plat_arch_setup(void)
 	fvp_configure_mmu_el1(bl2_tzram_layout.total_base,
 			      bl2_tzram_layout.total_size,
 			      BL2_RO_BASE,
-			      BL2_RO_LIMIT,
-			      BL2_COHERENT_RAM_BASE,
-			      BL2_COHERENT_RAM_LIMIT);
+			      BL2_RO_LIMIT
+#if USE_COHERENT_MEM
+			      , BL2_COHERENT_RAM_BASE,
+			      BL2_COHERENT_RAM_LIMIT
+#endif
+			      );
 }
 
 /*******************************************************************************
diff --git a/plat/fvp/bl31_fvp_setup.c b/plat/fvp/bl31_fvp_setup.c
index 69efc9cf0330b16ad512919bbde6ffcebe2d69b2..3874413f009ab0a7e8e05185dc17ae0e230d8660 100644
--- a/plat/fvp/bl31_fvp_setup.c
+++ b/plat/fvp/bl31_fvp_setup.c
@@ -48,19 +48,25 @@
  ******************************************************************************/
 extern unsigned long __RO_START__;
 extern unsigned long __RO_END__;
+extern unsigned long __BL31_END__;
 
+#if USE_COHERENT_MEM
 extern unsigned long __COHERENT_RAM_START__;
 extern unsigned long __COHERENT_RAM_END__;
+#endif
 
 /*
- * The next 2 constants identify the extents of the code & RO data region.
- * These addresses are used by the MMU setup code and therefore they must be
- * page-aligned.  It is the responsibility of the linker script to ensure that
- * __RO_START__ and __RO_END__ linker symbols refer to page-aligned addresses.
+ * The next 3 constants identify the extents of the code, RO data region and the
+ * limit of the BL3-1 image.  These addresses are used by the MMU setup code and
+ * therefore they must be page-aligned.  It is the responsibility of the linker
+ * script to ensure that __RO_START__, __RO_END__ & __BL31_END__ linker symbols
+ * refer to page-aligned addresses.
  */
 #define BL31_RO_BASE (unsigned long)(&__RO_START__)
 #define BL31_RO_LIMIT (unsigned long)(&__RO_END__)
+#define BL31_END (unsigned long)(&__BL31_END__)
 
+#if USE_COHERENT_MEM
 /*
  * The next 2 constants identify the extents of the coherent memory region.
  * These addresses are used by the MMU setup code and therefore they must be
@@ -70,7 +76,7 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL31_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL31_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
-
+#endif
 
 #if RESET_TO_BL31
 static entry_point_info_t bl32_image_ep_info;
@@ -235,9 +241,12 @@ void bl31_plat_arch_setup(void)
 	fvp_cci_enable();
 #endif
 	fvp_configure_mmu_el3(BL31_RO_BASE,
-			      (BL31_COHERENT_RAM_LIMIT - BL31_RO_BASE),
+			      (BL31_END - BL31_RO_BASE),
 			      BL31_RO_BASE,
-			      BL31_RO_LIMIT,
-			      BL31_COHERENT_RAM_BASE,
-			      BL31_COHERENT_RAM_LIMIT);
+			      BL31_RO_LIMIT
+#if USE_COHERENT_MEM
+			      , BL31_COHERENT_RAM_BASE,
+			      BL31_COHERENT_RAM_LIMIT
+#endif
+			      );
 }
diff --git a/plat/fvp/drivers/pwrc/fvp_pwrc.c b/plat/fvp/drivers/pwrc/fvp_pwrc.c
index c32c322bf2fc92085ad98c601f4b42a473aa008b..0497c2b81c22a2dc753b8c0a00a711c809c7f3b5 100644
--- a/plat/fvp/drivers/pwrc/fvp_pwrc.c
+++ b/plat/fvp/drivers/pwrc/fvp_pwrc.c
@@ -31,13 +31,19 @@
 #include <bakery_lock.h>
 #include <mmio.h>
 #include "../../fvp_def.h"
+#include "../../fvp_private.h"
 #include "fvp_pwrc.h"
 
 /*
  * TODO: Someday there will be a generic power controller api. At the moment
  * each platform has its own pwrc so just exporting functions is fine.
  */
+#if USE_COHERENT_MEM
 static bakery_lock_t pwrc_lock __attribute__ ((section("tzfw_coherent_mem")));
+#define LOCK_ARG	&pwrc_lock
+#else
+#define LOCK_ARG	FVP_PWRC_BAKERY_ID
+#endif
 
 unsigned int fvp_pwrc_get_cpu_wkr(unsigned long mpidr)
 {
@@ -47,54 +53,55 @@ unsigned int fvp_pwrc_get_cpu_wkr(unsigned long mpidr)
 unsigned int fvp_pwrc_read_psysr(unsigned long mpidr)
 {
 	unsigned int rc;
-	bakery_lock_get(&pwrc_lock);
+	fvp_lock_get(LOCK_ARG);
 	mmio_write_32(PWRC_BASE + PSYSR_OFF, (unsigned int) mpidr);
 	rc = mmio_read_32(PWRC_BASE + PSYSR_OFF);
-	bakery_lock_release(&pwrc_lock);
+	fvp_lock_release(LOCK_ARG);
 	return rc;
 }
 
 void fvp_pwrc_write_pponr(unsigned long mpidr)
 {
-	bakery_lock_get(&pwrc_lock);
+	fvp_lock_get(LOCK_ARG);
 	mmio_write_32(PWRC_BASE + PPONR_OFF, (unsigned int) mpidr);
-	bakery_lock_release(&pwrc_lock);
+	fvp_lock_release(LOCK_ARG);
 }
 
 void fvp_pwrc_write_ppoffr(unsigned long mpidr)
 {
-	bakery_lock_get(&pwrc_lock);
+	fvp_lock_get(LOCK_ARG);
 	mmio_write_32(PWRC_BASE + PPOFFR_OFF, (unsigned int) mpidr);
-	bakery_lock_release(&pwrc_lock);
+	fvp_lock_release(LOCK_ARG);
 }
 
 void fvp_pwrc_set_wen(unsigned long mpidr)
 {
-	bakery_lock_get(&pwrc_lock);
+	fvp_lock_get(LOCK_ARG);
 	mmio_write_32(PWRC_BASE + PWKUPR_OFF,
 		      (unsigned int) (PWKUPR_WEN | mpidr));
-	bakery_lock_release(&pwrc_lock);
+	fvp_lock_release(LOCK_ARG);
 }
 
 void fvp_pwrc_clr_wen(unsigned long mpidr)
 {
-	bakery_lock_get(&pwrc_lock);
+	fvp_lock_get(LOCK_ARG);
 	mmio_write_32(PWRC_BASE + PWKUPR_OFF,
 		      (unsigned int) mpidr);
-	bakery_lock_release(&pwrc_lock);
+	fvp_lock_release(LOCK_ARG);
 }
 
 void fvp_pwrc_write_pcoffr(unsigned long mpidr)
 {
-	bakery_lock_get(&pwrc_lock);
+	fvp_lock_get(LOCK_ARG);
 	mmio_write_32(PWRC_BASE + PCOFFR_OFF, (unsigned int) mpidr);
-	bakery_lock_release(&pwrc_lock);
+	fvp_lock_release(LOCK_ARG);
 }
 
 /* Nothing else to do here apart from initializing the lock */
 int fvp_pwrc_setup(void)
 {
-	bakery_lock_init(&pwrc_lock);
+	fvp_lock_init(LOCK_ARG);
+
 	return 0;
 }
 
diff --git a/plat/fvp/fvp_private.h b/plat/fvp/fvp_private.h
index 2dcb327ff1ac87ce998a9474d2b15c03f414d5a8..3949754b9b8824fded9f5e5addf2144755a35243 100644
--- a/plat/fvp/fvp_private.h
+++ b/plat/fvp/fvp_private.h
@@ -31,7 +31,9 @@
 #ifndef __FVP_PRIVATE_H__
 #define __FVP_PRIVATE_H__
 
+#include <bakery_lock.h>
 #include <bl_common.h>
+#include <cpu_data.h>
 #include <platform_def.h>
 
 
@@ -55,10 +57,60 @@ typedef struct bl2_to_bl31_params_mem {
 	entry_point_info_t bl31_ep_info;
 } bl2_to_bl31_params_mem_t;
 
+#if USE_COHERENT_MEM
+/*
+ * These are wrapper macros to the Coherent Memory Bakery Lock API.
+ */
+#define fvp_lock_init(_lock_arg)	bakery_lock_init(_lock_arg)
+#define fvp_lock_get(_lock_arg)		bakery_lock_get(_lock_arg)
+#define fvp_lock_release(_lock_arg)	bakery_lock_release(_lock_arg)
+
+#else
+
 /*******************************************************************************
- * Forward declarations
+ * Constants to specify how many bakery locks this platform implements. These
+ * are used if the platform chooses not to use coherent memory for bakery lock
+ * data structures.
  ******************************************************************************/
-struct meminfo;
+#define FVP_MAX_BAKERIES	1
+#define FVP_PWRC_BAKERY_ID	0
+
+/*******************************************************************************
+ * Definition of structure which holds platform specific per-cpu data. Currently
+ * it holds only the bakery lock information for each cpu. Constants to
+ * specify how many bakeries this platform implements and bakery ids are
+ * defined above in this header.
+ ******************************************************************************/
+typedef struct fvp_cpu_data {
+	bakery_info_t pcpu_bakery_info[FVP_MAX_BAKERIES];
+} fvp_cpu_data_t;
+
+/* Macro to define the offset of bakery_info_t in fvp_cpu_data_t */
+#define FVP_CPU_DATA_LOCK_OFFSET	__builtin_offsetof\
+					    (fvp_cpu_data_t, pcpu_bakery_info)
+
+
+/*******************************************************************************
+ * Helper macros for bakery lock api when using the above fvp_cpu_data_t for
+ * bakery lock data structures. It assumes that the bakery_info is at the
+ * beginning of the platform specific per-cpu data.
+ ******************************************************************************/
+#define fvp_lock_init(_lock_arg)	/* No init required */
+#define fvp_lock_get(_lock_arg)		bakery_lock_get(_lock_arg,  	    \
+						CPU_DATA_PLAT_PCPU_OFFSET + \
+						FVP_CPU_DATA_LOCK_OFFSET)
+#define fvp_lock_release(_lock_arg)	bakery_lock_release(_lock_arg,	    \
+						CPU_DATA_PLAT_PCPU_OFFSET + \
+						FVP_CPU_DATA_LOCK_OFFSET)
+
+/*
+ * Ensure that the size of the FVP specific per-cpu data structure and the size
+ * of the memory allocated in generic per-cpu data for the platform are the same.
+ */
+CASSERT(PLAT_PCPU_DATA_SIZE == sizeof(fvp_cpu_data_t),	\
+	fvp_pcpu_data_size_mismatch);
+
+#endif /* __USE_COHERENT_MEM__ */
 
 /*******************************************************************************
  * Function and variable prototypes
@@ -66,15 +118,22 @@ struct meminfo;
 void fvp_configure_mmu_el1(unsigned long total_base,
 			   unsigned long total_size,
 			   unsigned long,
-			   unsigned long,
-			   unsigned long,
-			   unsigned long);
+			   unsigned long
+#if USE_COHERENT_MEM
+			   , unsigned long,
+			   unsigned long
+#endif
+			   );
 void fvp_configure_mmu_el3(unsigned long total_base,
 			   unsigned long total_size,
 			   unsigned long,
-			   unsigned long,
-			   unsigned long,
-			   unsigned long);
+			   unsigned long
+#if USE_COHERENT_MEM
+			   , unsigned long,
+			   unsigned long
+#endif
+			   );
+
 int fvp_config_setup(void);
 
 void fvp_cci_init(void);
diff --git a/plat/fvp/include/platform_def.h b/plat/fvp/include/platform_def.h
index 5364a3da2f491f0593b6adf156cea333aeb830c9..e3c48e67193f4f21b9e67fadf6f449d94144e123 100644
--- a/plat/fvp/include/platform_def.h
+++ b/plat/fvp/include/platform_def.h
@@ -169,5 +169,12 @@
 #define CACHE_WRITEBACK_SHIFT   6
 #define CACHE_WRITEBACK_GRANULE (1 << CACHE_WRITEBACK_SHIFT)
 
+#if !USE_COHERENT_MEM
+/*******************************************************************************
+ * Size of the per-cpu data in bytes that should be reserved in the generic
+ * per-cpu data structure for the FVP port.
+ ******************************************************************************/
+#define PLAT_PCPU_DATA_SIZE	2
+#endif
 
 #endif /* __PLATFORM_DEF_H__ */
diff --git a/plat/fvp/tsp/tsp_fvp_setup.c b/plat/fvp/tsp/tsp_fvp_setup.c
index 301f6693be8d52b5ccfa1e727bd4e58634c531a1..d8f46bd2652b0f00548fd0bac942f0e5b5ba3805 100644
--- a/plat/fvp/tsp/tsp_fvp_setup.c
+++ b/plat/fvp/tsp/tsp_fvp_setup.c
@@ -40,19 +40,25 @@
  ******************************************************************************/
 extern unsigned long __RO_START__;
 extern unsigned long __RO_END__;
+extern unsigned long __BL32_END__;
 
+#if USE_COHERENT_MEM
 extern unsigned long __COHERENT_RAM_START__;
 extern unsigned long __COHERENT_RAM_END__;
+#endif
 
 /*
- * The next 2 constants identify the extents of the code & RO data region.
- * These addresses are used by the MMU setup code and therefore they must be
- * page-aligned.  It is the responsibility of the linker script to ensure that
- * __RO_START__ and __RO_END__ linker symbols refer to page-aligned addresses.
+ * The next 3 constants identify the extents of the code & RO data region and
+ * the limit of the BL3-2 image. These addresses are used by the MMU setup code
+ * and therefore they must be page-aligned.  It is the responsibility of the
+ * linker script to ensure that __RO_START__, __RO_END__ & __BL32_END__
+ * linker symbols refer to page-aligned addresses.
  */
 #define BL32_RO_BASE (unsigned long)(&__RO_START__)
 #define BL32_RO_LIMIT (unsigned long)(&__RO_END__)
+#define BL32_END (unsigned long)(&__BL32_END__)
 
+#if USE_COHERENT_MEM
 /*
  * The next 2 constants identify the extents of the coherent memory region.
  * These addresses are used by the MMU setup code and therefore they must be
@@ -62,6 +68,7 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL32_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL32_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#endif
 
 /*******************************************************************************
  * Initialize the UART
@@ -93,9 +100,12 @@ void tsp_platform_setup(void)
 void tsp_plat_arch_setup(void)
 {
 	fvp_configure_mmu_el1(BL32_RO_BASE,
-			      (BL32_COHERENT_RAM_LIMIT - BL32_RO_BASE),
+			      (BL32_END - BL32_RO_BASE),
 			      BL32_RO_BASE,
-			      BL32_RO_LIMIT,
-			      BL32_COHERENT_RAM_BASE,
-			      BL32_COHERENT_RAM_LIMIT);
+			      BL32_RO_LIMIT
+#if USE_COHERENT_MEM
+			      , BL32_COHERENT_RAM_BASE,
+			      BL32_COHERENT_RAM_LIMIT
+#endif
+			      );
 }
diff --git a/plat/juno/aarch64/juno_common.c b/plat/juno/aarch64/juno_common.c
index 8129b0511057cfc9906aaa1f60fa702cdcb0e781..7ad40d0da9f64be3b38436a04802a4d25e14097f 100644
--- a/plat/juno/aarch64/juno_common.c
+++ b/plat/juno/aarch64/juno_common.c
@@ -140,6 +140,7 @@ const unsigned int num_sec_irqs = sizeof(irq_sec_array) /
  * Macro generating the code for the function setting up the pagetables as per
  * the platform memory map & initialize the mmu, for the given exception level
  ******************************************************************************/
+#if USE_COHERENT_MEM
 #define DEFINE_CONFIGURE_MMU_EL(_el)				\
 	void configure_mmu_el##_el(unsigned long total_base,	\
 				  unsigned long total_size,	\
@@ -162,7 +163,25 @@ const unsigned int num_sec_irqs = sizeof(irq_sec_array) /
 								\
 	       enable_mmu_el##_el(0);				\
 	}
-
+#else
+#define DEFINE_CONFIGURE_MMU_EL(_el)				\
+	void configure_mmu_el##_el(unsigned long total_base,	\
+				  unsigned long total_size,	\
+				  unsigned long ro_start,	\
+				  unsigned long ro_limit)	\
+	{							\
+	       mmap_add_region(total_base, total_base,		\
+			       total_size,			\
+			       MT_MEMORY | MT_RW | MT_SECURE);	\
+	       mmap_add_region(ro_start, ro_start,		\
+			       ro_limit - ro_start,		\
+			       MT_MEMORY | MT_RO | MT_SECURE);	\
+	       mmap_add(juno_mmap);				\
+	       init_xlat_tables();				\
+								\
+	       enable_mmu_el##_el(0);				\
+	}
+#endif
 /* Define EL1 and EL3 variants of the function initialising the MMU */
 DEFINE_CONFIGURE_MMU_EL(1)
 DEFINE_CONFIGURE_MMU_EL(3)
diff --git a/plat/juno/bl1_plat_setup.c b/plat/juno/bl1_plat_setup.c
index e27e3948925576ef767bac8b23a127efade10997..23e8592be5a2449b1a4c84b9eff13c0941cde02c 100644
--- a/plat/juno/bl1_plat_setup.c
+++ b/plat/juno/bl1_plat_setup.c
@@ -41,6 +41,7 @@
 #include "juno_def.h"
 #include "juno_private.h"
 
+#if USE_COHERENT_MEM
 /*******************************************************************************
  * Declarations of linker defined symbols which will help us find the layout
  * of trusted RAM
@@ -57,6 +58,7 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL1_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL1_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#endif
 
 /* Data structure which holds the extents of the trusted RAM for BL1 */
 static meminfo_t bl1_tzram_layout;
@@ -189,9 +191,12 @@ void bl1_plat_arch_setup(void)
 	configure_mmu_el3(bl1_tzram_layout.total_base,
 			  bl1_tzram_layout.total_size,
 			  TZROM_BASE,
-			  TZROM_BASE + TZROM_SIZE,
-			  BL1_COHERENT_RAM_BASE,
-			  BL1_COHERENT_RAM_LIMIT);
+			  TZROM_BASE + TZROM_SIZE
+#if USE_COHERENT_MEM
+			  , BL1_COHERENT_RAM_BASE,
+			  BL1_COHERENT_RAM_LIMIT
+#endif
+			  );
 }
 
 /*******************************************************************************
diff --git a/plat/juno/bl2_plat_setup.c b/plat/juno/bl2_plat_setup.c
index 900a587fdb11689ec46f5f19fb73ac88729b76b5..8e7b2a0ae819c26115df7bf9bf0b00dbfb748690 100644
--- a/plat/juno/bl2_plat_setup.c
+++ b/plat/juno/bl2_plat_setup.c
@@ -47,8 +47,10 @@
 extern unsigned long __RO_START__;
 extern unsigned long __RO_END__;
 
+#if USE_COHERENT_MEM
 extern unsigned long __COHERENT_RAM_START__;
 extern unsigned long __COHERENT_RAM_END__;
+#endif
 
 /*
  * The next 2 constants identify the extents of the code & RO data region.
@@ -59,6 +61,7 @@ extern unsigned long __COHERENT_RAM_END__;
 #define BL2_RO_BASE (unsigned long)(&__RO_START__)
 #define BL2_RO_LIMIT (unsigned long)(&__RO_END__)
 
+#if USE_COHERENT_MEM
 /*
  * The next 2 constants identify the extents of the coherent memory region.
  * These addresses are used by the MMU setup code and therefore they must be
@@ -68,11 +71,11 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL2_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL2_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#endif
 
 /* Data structure which holds the extents of the trusted RAM for BL2 */
 static meminfo_t bl2_tzram_layout
-__attribute__ ((aligned(PLATFORM_CACHE_LINE_SIZE),
-		section("tzfw_coherent_mem")));
+__attribute__ ((aligned(PLATFORM_CACHE_LINE_SIZE)));
 
 /*******************************************************************************
  * Structure which holds the arguments which need to be passed to BL3-1
@@ -194,9 +197,12 @@ void bl2_plat_arch_setup(void)
 	configure_mmu_el1(bl2_tzram_layout.total_base,
 			  bl2_tzram_layout.total_size,
 			  BL2_RO_BASE,
-			  BL2_RO_LIMIT,
-			  BL2_COHERENT_RAM_BASE,
-			  BL2_COHERENT_RAM_LIMIT);
+			  BL2_RO_LIMIT
+#if USE_COHERENT_MEM
+			  , BL2_COHERENT_RAM_BASE,
+			  BL2_COHERENT_RAM_LIMIT
+#endif
+			  );
 }
 
 /*******************************************************************************
diff --git a/plat/juno/bl31_plat_setup.c b/plat/juno/bl31_plat_setup.c
index c4504622529e21e6eb70a5bda4450a29a0bf1bff..ad8ea4351ddd41a3efbd755985ecb6980406c119 100644
--- a/plat/juno/bl31_plat_setup.c
+++ b/plat/juno/bl31_plat_setup.c
@@ -48,19 +48,25 @@
  ******************************************************************************/
 extern unsigned long __RO_START__;
 extern unsigned long __RO_END__;
+extern unsigned long __BL31_END__;
 
+#if USE_COHERENT_MEM
 extern unsigned long __COHERENT_RAM_START__;
 extern unsigned long __COHERENT_RAM_END__;
+#endif
 
 /*
- * The next 2 constants identify the extents of the code & RO data region.
- * These addresses are used by the MMU setup code and therefore they must be
- * page-aligned.  It is the responsibility of the linker script to ensure that
- * __RO_START__ and __RO_END__ linker symbols refer to page-aligned addresses.
+ * The next 3 constants identify the extents of the code, RO data region and the
+ * limit of the BL3-1 image.  These addresses are used by the MMU setup code and
+ * therefore they must be page-aligned.  It is the responsibility of the linker
+ * script to ensure that __RO_START__, __RO_END__ & __BL31_END__ linker symbols
+ * refer to page-aligned addresses.
  */
 #define BL31_RO_BASE (unsigned long)(&__RO_START__)
 #define BL31_RO_LIMIT (unsigned long)(&__RO_END__)
+#define BL31_END (unsigned long)(&__BL31_END__)
 
+#if USE_COHERENT_MEM
 /*
  * The next 2 constants identify the extents of the coherent memory region.
  * These addresses are used by the MMU setup code and therefore they must be
@@ -70,6 +76,7 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL31_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL31_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#endif
 
 /******************************************************************************
  * Placeholder variables for copying the arguments that have been passed to
@@ -178,9 +185,13 @@ void bl31_platform_setup(void)
 void bl31_plat_arch_setup()
 {
 	configure_mmu_el3(BL31_RO_BASE,
-			  BL31_COHERENT_RAM_LIMIT - BL31_RO_BASE,
+			  (BL31_END - BL31_RO_BASE),
 			  BL31_RO_BASE,
-			  BL31_RO_LIMIT,
+			  BL31_RO_LIMIT
+#if USE_COHERENT_MEM
+			  ,
 			  BL31_COHERENT_RAM_BASE,
-			  BL31_COHERENT_RAM_LIMIT);
+			  BL31_COHERENT_RAM_LIMIT
+#endif
+			  );
 }
diff --git a/plat/juno/include/platform_def.h b/plat/juno/include/platform_def.h
index ee77b83237fdf2977393ce7cd576f559d55549ed..cd077021ebdb48157ac39f86b2a9a85fd8c73918 100644
--- a/plat/juno/include/platform_def.h
+++ b/plat/juno/include/platform_def.h
@@ -174,4 +174,12 @@
 #define CACHE_WRITEBACK_SHIFT   6
 #define CACHE_WRITEBACK_GRANULE (1 << CACHE_WRITEBACK_SHIFT)
 
+#if !USE_COHERENT_MEM
+/*******************************************************************************
+ * Size of the per-cpu data in bytes that should be reserved in the generic
+ * per-cpu data structure for the Juno port.
+ ******************************************************************************/
+#define PLAT_PCPU_DATA_SIZE	2
+#endif
+
 #endif /* __PLATFORM_DEF_H__ */
diff --git a/plat/juno/juno_private.h b/plat/juno/juno_private.h
index 14d7af4dad2eab290ecdb0bd428e470d959c8b32..70439e8b5d68912f7f21b5a8d260bbd999a85941 100644
--- a/plat/juno/juno_private.h
+++ b/plat/juno/juno_private.h
@@ -31,7 +31,9 @@
 #ifndef __JUNO_PRIVATE_H__
 #define __JUNO_PRIVATE_H__
 
+#include <bakery_lock.h>
 #include <bl_common.h>
+#include <cpu_data.h>
 #include <platform_def.h>
 #include <stdint.h>
 
@@ -59,6 +61,68 @@ typedef struct bl2_to_bl31_params_mem {
 	struct entry_point_info bl31_ep_info;
 } bl2_to_bl31_params_mem_t;
 
+#if IMAGE_BL31
+#if USE_COHERENT_MEM
+/*
+ * These are wrapper macros to the Coherent Memory Bakery Lock API.
+ */
+#define juno_lock_init(_lock_arg)		bakery_lock_init(_lock_arg)
+#define juno_lock_get(_lock_arg)		bakery_lock_get(_lock_arg)
+#define juno_lock_release(_lock_arg)		bakery_lock_release(_lock_arg)
+
+#else
+
+/*******************************************************************************
+ * Constants that specify how many bakeries this platform implements and bakery
+ * ids.
+ ******************************************************************************/
+#define JUNO_MAX_BAKERIES	1
+#define JUNO_MHU_BAKERY_ID	0
+
+/*******************************************************************************
+ * Definition of the structure which holds platform specific per-cpu data.
+ * Currently it holds only the bakery lock information for each cpu. The
+ * constants that specify how many bakeries this platform implements and the
+ * bakery ids are defined above in this file.
+ ******************************************************************************/
+typedef struct juno_cpu_data {
+	bakery_info_t pcpu_bakery_info[JUNO_MAX_BAKERIES];
+} juno_cpu_data_t;
+
+/* Macro to define the offset of bakery_info_t in juno_cpu_data_t */
+#define JUNO_CPU_DATA_LOCK_OFFSET	__builtin_offsetof\
+					    (juno_cpu_data_t, pcpu_bakery_info)
+
+/*******************************************************************************
+ * Helper macros for bakery lock api when using the above juno_cpu_data_t for
+ * bakery lock data structures. It assumes that the bakery_info is at the
+ * beginning of the platform specific per-cpu data.
+ ******************************************************************************/
+#define juno_lock_init(_lock_arg)		/* No init required */
+#define juno_lock_get(_lock_arg)		bakery_lock_get(_lock_arg,	\
+						    CPU_DATA_PLAT_PCPU_OFFSET + \
+						    JUNO_CPU_DATA_LOCK_OFFSET)
+#define juno_lock_release(_lock_arg)		bakery_lock_release(_lock_arg,	\
+						    CPU_DATA_PLAT_PCPU_OFFSET + \
+						    JUNO_CPU_DATA_LOCK_OFFSET)
+
+/*
+ * Ensure that the size of the Juno specific per-cpu data structure and the size
+ * of the memory allocated in generic per-cpu data for the platform are the same.
+ */
+CASSERT(PLAT_PCPU_DATA_SIZE == sizeof(juno_cpu_data_t),	\
+	juno_pcpu_data_size_mismatch);
+#endif /* USE_COHERENT_MEM */
+#else
+/*
+ * Dummy wrapper macros for all other BL stages other than BL3-1
+ */
+#define juno_lock_init(_lock_arg)
+#define juno_lock_get(_lock_arg)
+#define juno_lock_release(_lock_arg)
+
+#endif /* IMAGE_BL31 */
+
 /*******************************************************************************
  * Function and variable prototypes
  ******************************************************************************/
@@ -70,15 +134,21 @@ unsigned int platform_get_core_pos(unsigned long mpidr);
 void configure_mmu_el1(unsigned long total_base,
 		       unsigned long total_size,
 		       unsigned long ro_start,
-		       unsigned long ro_limit,
-		       unsigned long coh_start,
-		       unsigned long coh_limit);
+		       unsigned long ro_limit
+#if USE_COHERENT_MEM
+		       , unsigned long coh_start,
+		       unsigned long coh_limit
+#endif
+		       );
 void configure_mmu_el3(unsigned long total_base,
 		       unsigned long total_size,
 		       unsigned long ro_start,
-		       unsigned long ro_limit,
-		       unsigned long coh_start,
-		       unsigned long coh_limit);
+		       unsigned long ro_limit
+#if USE_COHERENT_MEM
+		       , unsigned long coh_start,
+		       unsigned long coh_limit
+#endif
+		       );
 void plat_report_exception(unsigned long type);
 unsigned long plat_get_ns_image_entrypoint(void);
 unsigned long platform_get_stack(unsigned long mpidr);
diff --git a/plat/juno/mhu.c b/plat/juno/mhu.c
index b6541a8873e8cc8ca95aead4adbf50413a7c2cd6..c1c414c29a7298a71044395c6d83f94edf7e768f 100644
--- a/plat/juno/mhu.c
+++ b/plat/juno/mhu.c
@@ -32,6 +32,7 @@
 #include <bakery_lock.h>
 #include <mmio.h>
 #include "juno_def.h"
+#include "juno_private.h"
 #include "mhu.h"
 
 /* SCP MHU secure channel registers */
@@ -44,13 +45,20 @@
 #define CPU_INTR_S_SET		0x308
 #define CPU_INTR_S_CLEAR	0x310
 
-
+#if IMAGE_BL31
+#if USE_COHERENT_MEM
 static bakery_lock_t mhu_secure_lock __attribute__ ((section("tzfw_coherent_mem")));
-
+#define LOCK_ARG		&mhu_secure_lock
+#else
+#define LOCK_ARG		JUNO_MHU_BAKERY_ID
+#endif /* USE_COHERENT_MEM */
+#else
+#define LOCK_ARG	/* Locks required only for BL3-1 images */
+#endif /* IMAGE_BL31 */
 
 void mhu_secure_message_start(void)
 {
-	bakery_lock_get(&mhu_secure_lock);
+	juno_lock_get(LOCK_ARG);
 
 	/* Make sure any previous command has finished */
 	while (mmio_read_32(MHU_BASE + CPU_INTR_S_STAT) != 0)
@@ -80,12 +88,12 @@ void mhu_secure_message_end(void)
 	/* Clear any response we got by writing all ones to the CLEAR register */
 	mmio_write_32(MHU_BASE + SCP_INTR_S_CLEAR, 0xffffffffu);
 
-	bakery_lock_release(&mhu_secure_lock);
+	juno_lock_release(LOCK_ARG);
 }
 
 void mhu_secure_init(void)
 {
-	bakery_lock_init(&mhu_secure_lock);
+	juno_lock_init(LOCK_ARG);
 
 	/*
 	 * Clear the CPU's INTR register to make sure we don't see a stale
diff --git a/plat/juno/platform.mk b/plat/juno/platform.mk
index 6ca219d98204d6d779cd32c6e93442415ebec2e8..158e3ace354863b9f59090c0e947cde2d5bd7a86 100644
--- a/plat/juno/platform.mk
+++ b/plat/juno/platform.mk
@@ -66,7 +66,6 @@ BL1_SOURCES		+=	drivers/arm/cci400/cci400.c		\
 				plat/juno/aarch64/juno_common.c
 
 BL2_SOURCES		+=	drivers/arm/tzc400/tzc400.c		\
-				lib/locks/bakery/bakery_lock.c		\
 				plat/common/aarch64/platform_up_stack.S	\
 				plat/juno/bl2_plat_setup.c		\
 				plat/juno/mhu.c				\
diff --git a/plat/juno/tsp/tsp_plat_setup.c b/plat/juno/tsp/tsp_plat_setup.c
index 0a9d4cbe8bd850a15cc78764d9fabb81058c07c7..8293a13268d6b11f247243c19346d268cdced456 100644
--- a/plat/juno/tsp/tsp_plat_setup.c
+++ b/plat/juno/tsp/tsp_plat_setup.c
@@ -40,19 +40,25 @@
  ******************************************************************************/
 extern unsigned long __RO_START__;
 extern unsigned long __RO_END__;
+extern unsigned long __BL32_END__;
 
+#if USE_COHERENT_MEM
 extern unsigned long __COHERENT_RAM_START__;
 extern unsigned long __COHERENT_RAM_END__;
+#endif
 
 /*
- * The next 2 constants identify the extents of the code & RO data region.
- * These addresses are used by the MMU setup code and therefore they must be
- * page-aligned.  It is the responsibility of the linker script to ensure that
- * __RO_START__ and __RO_END__ linker symbols refer to page-aligned addresses.
+ * The next 3 constants identify the extents of the code, RO data region and the
+ * limit of the BL3-2 image.  These addresses are used by the MMU setup code and
+ * therefore they must be page-aligned.  It is the responsibility of the linker
+ * script to ensure that __RO_START__, __RO_END__ & __BL32_END__ linker symbols
+ * refer to page-aligned addresses.
  */
 #define BL32_RO_BASE (unsigned long)(&__RO_START__)
 #define BL32_RO_LIMIT (unsigned long)(&__RO_END__)
+#define BL32_END (unsigned long)(&__BL32_END__)
 
+#if USE_COHERENT_MEM
 /*
  * The next 2 constants identify the extents of the coherent memory region.
  * These addresses are used by the MMU setup code and therefore they must be
@@ -62,6 +68,7 @@ extern unsigned long __COHERENT_RAM_END__;
  */
 #define BL32_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL32_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
+#endif
 
 /*******************************************************************************
  * Initialize the UART
@@ -90,9 +97,12 @@ void tsp_platform_setup(void)
 void tsp_plat_arch_setup(void)
 {
 	configure_mmu_el1(BL32_RO_BASE,
-			  BL32_COHERENT_RAM_LIMIT - BL32_RO_BASE,
+			  (BL32_END - BL32_RO_BASE),
 			  BL32_RO_BASE,
-			  BL32_RO_LIMIT,
-			  BL32_COHERENT_RAM_BASE,
-			  BL32_COHERENT_RAM_LIMIT);
+			  BL32_RO_LIMIT
+#if USE_COHERENT_MEM
+			  , BL32_COHERENT_RAM_BASE,
+			  BL32_COHERENT_RAM_LIMIT
+#endif
+			  );
 }
diff --git a/services/std_svc/psci/psci_common.c b/services/std_svc/psci/psci_common.c
index 155f842e26fb6f41360061bd4e0230e608f1208d..0a1cdf9e54ec9421cf266884727aaa09a9c0338c 100644
--- a/services/std_svc/psci/psci_common.c
+++ b/services/std_svc/psci/psci_common.c
@@ -51,7 +51,10 @@ const spd_pm_ops_t *psci_spd_pm;
  * corresponds to an affinity instance e.g. cluster, cpu within an mpidr
  ******************************************************************************/
 aff_map_node_t psci_aff_map[PSCI_NUM_AFFS]
-__attribute__ ((section("tzfw_coherent_mem")));
+#if USE_COHERENT_MEM
+__attribute__ ((section("tzfw_coherent_mem")))
+#endif
+;
 
 /*******************************************************************************
  * Pointer to functions exported by the platform to complete power mgmt. ops
@@ -246,7 +249,8 @@ void psci_acquire_afflvl_locks(int start_afflvl,
 	for (level = start_afflvl; level <= end_afflvl; level++) {
 		if (mpidr_nodes[level] == NULL)
 			continue;
-		bakery_lock_get(&mpidr_nodes[level]->lock);
+
+		psci_lock_get(mpidr_nodes[level]);
 	}
 }
 
@@ -264,7 +268,8 @@ void psci_release_afflvl_locks(int start_afflvl,
 	for (level = end_afflvl; level >= start_afflvl; level--) {
 		if (mpidr_nodes[level] == NULL)
 			continue;
-		bakery_lock_release(&mpidr_nodes[level]->lock);
+
+		psci_lock_release(mpidr_nodes[level]);
 	}
 }
 
@@ -350,6 +355,10 @@ int psci_save_ns_entry(uint64_t mpidr,
  ******************************************************************************/
 unsigned short psci_get_state(aff_map_node_t *node)
 {
+#if !USE_COHERENT_MEM
+	flush_dcache_range((uint64_t) node, sizeof(*node));
+#endif
+
 	assert(node->level >= MPIDR_AFFLVL0 && node->level <= MPIDR_MAX_AFFLVL);
 
 	/* A cpu node just contains the state which can be directly returned */
@@ -407,6 +416,10 @@ void psci_set_state(aff_map_node_t *node, unsigned short state)
 		node->state &= ~(PSCI_STATE_MASK << PSCI_STATE_SHIFT);
 		node->state |= (state & PSCI_STATE_MASK) << PSCI_STATE_SHIFT;
 	}
+
+#if !USE_COHERENT_MEM
+	flush_dcache_range((uint64_t) node, sizeof(*node));
+#endif
 }
 
 /*******************************************************************************
diff --git a/services/std_svc/psci/psci_private.h b/services/std_svc/psci/psci_private.h
index 24a5604e7dcf4aff3e45e9f729fd6158f56a28d0..9a8ef73de4c12750156660247d09c179bce38586 100644
--- a/services/std_svc/psci/psci_private.h
+++ b/services/std_svc/psci/psci_private.h
@@ -33,14 +33,22 @@
 
 #include <arch.h>
 #include <bakery_lock.h>
-#include <platform_def.h>	/* for PLATFORM_NUM_AFFS */
 #include <psci.h>
 
-/* Number of affinity instances whose state this psci imp. can track */
-#ifdef PLATFORM_NUM_AFFS
-#define PSCI_NUM_AFFS		PLATFORM_NUM_AFFS
+/*
+ * The following helper macros abstract the interface to the Bakery
+ * Lock API.
+ */
+#if USE_COHERENT_MEM
+#define psci_lock_init(aff_map, idx)	bakery_lock_init(&(aff_map)[(idx)].lock)
+#define psci_lock_get(node)		bakery_lock_get(&((node)->lock))
+#define psci_lock_release(node)		bakery_lock_release(&((node)->lock))
 #else
-#define PSCI_NUM_AFFS		(2 * PLATFORM_CORE_COUNT)
+#define psci_lock_init(aff_map, idx)	((aff_map)[(idx)].aff_map_index = (idx))
+#define psci_lock_get(node)		bakery_lock_get((node)->aff_map_index,	  \
+						CPU_DATA_PSCI_LOCK_OFFSET)
+#define psci_lock_release(node)		bakery_lock_release((node)->aff_map_index,\
+						CPU_DATA_PSCI_LOCK_OFFSET)
 #endif
 
 /*******************************************************************************
@@ -49,10 +57,15 @@
  ******************************************************************************/
 typedef struct aff_map_node {
 	unsigned long mpidr;
-	unsigned short ref_count;
+	unsigned char ref_count;
 	unsigned char state;
 	unsigned char level;
+#if USE_COHERENT_MEM
 	bakery_lock_t lock;
+#else
+	/* For indexing the bakery_info array in per CPU data */
+	unsigned char aff_map_index;
+#endif
 } aff_map_node_t;
 
 typedef struct aff_limits_node {
diff --git a/services/std_svc/psci/psci_setup.c b/services/std_svc/psci/psci_setup.c
index e0bc8331393414f2849a6e34869ab0667852ba91..be504e819a85b651d2bf46d5a200b7bf5e6b9fbb 100644
--- a/services/std_svc/psci/psci_setup.c
+++ b/services/std_svc/psci/psci_setup.c
@@ -181,7 +181,7 @@ static void psci_init_aff_map_node(unsigned long mpidr,
 	uint32_t linear_id;
 	psci_aff_map[idx].mpidr = mpidr;
 	psci_aff_map[idx].level = level;
-	bakery_lock_init(&psci_aff_map[idx].lock);
+	psci_lock_init(psci_aff_map, idx);
 
 	/*
 	 * If an affinity instance is present then mark it as OFF to begin with.
@@ -331,13 +331,20 @@ int32_t psci_setup(void)
 					       afflvl);
 	}
 
+#if !USE_COHERENT_MEM
+	/*
+	 * The psci_aff_map only needs flushing when it's not allocated in
+	 * coherent memory.
+	 */
+	flush_dcache_range((uint64_t) &psci_aff_map, sizeof(psci_aff_map));
+#endif
+
 	/*
 	 * Set the bounds for the affinity counts of each level in the map. Also
 	 * flush out the entire array so that it's visible to subsequent power
-	 * management operations. The 'psci_aff_map' array is allocated in
-	 * coherent memory so does not need flushing. The 'psci_aff_limits'
-	 * array is allocated in normal memory. It will be accessed when the mmu
-	 * is off e.g. after reset. Hence it needs to be flushed.
+	 * management operations. The 'psci_aff_limits' array is allocated in
+	 * normal memory. It will be accessed when the mmu is off e.g. after
+	 * reset. Hence it needs to be flushed.
 	 */
 	for (afflvl = MPIDR_AFFLVL0; afflvl < max_afflvl; afflvl++) {
 		psci_aff_limits[afflvl].min =