diff --git a/compiler/rustc_ty_utils/src/abi.rs b/compiler/rustc_ty_utils/src/abi.rs
index f23c2cf2c07..536bcc0cd9e 100644
--- a/compiler/rustc_ty_utils/src/abi.rs
+++ b/compiler/rustc_ty_utils/src/abi.rs
@@ -726,6 +726,49 @@ fn fn_abi_adjust_for_abi<'tcx>(
 
             };
         }
 
+        if arg_idx.is_none() && arg.layout.size > Pointer(AddressSpace::DATA).size(cx) * 2 {
+            // Return values larger than 2 registers are returned using
+            // a return area pointer. LLVM and Cranelift disagree about
+            // how to return values that don't fit in the registers
+            // designated for return values. LLVM will force the entire
+            // return value to be passed by return area pointer, while
+            // Cranelift will look at each IR-level return value
+            // independently and decide to pass it in a register or not,
+            // which would result in the return value being passed
+            // partially in registers and partially through a return
+            // area pointer.
+            //
+            // While Cranelift may need to be fixed, as the LLVM
+            // behavior is generally more correct with respect to the
+            // surface language, forcing this behavior in rustc itself
+            // makes it easier for other backends to conform to the
+            // Rust ABI, and for the C ABI rustc already handles this
+            // behavior anyway.
+            //
+            // In addition, LLVM's decision to pass the return value in
+            // registers or using a return area pointer depends on how
+            // exactly the return type is lowered to an LLVM IR type.
+            // For example `Option<u128>` can be lowered as
+            // `{ i128, i128 }`, in which case the x86_64 backend would
+            // use a return area pointer, or it could be passed as
+            // `{ i32, i128 }`, in which case the x86_64 backend would
+            // pass it in registers by taking advantage of an LLVM ABI
+            // extension that allows using 3 registers for the x86_64
+            // sysv call conv rather than the officially specified 2
+            // registers.
+            //
+            // FIXME: Technically we should look at the number of
+            // available return registers rather than guessing that
+            // there are 2 registers for return values. In practice
+            // only a couple of architectures have fewer than 2 return
+            // registers, none of which are supported by Cranelift.
+            //
+            // NOTE: This adjustment is only necessary for the Rust
+            // ABI, as for other ABIs the calling convention
+            // implementations in rustc_target already ensure that any
+            // return value which doesn't fit in the available return
+            // registers is passed in the right way for the current
+            // target.
+            arg.make_indirect();
+            return;
+        }
+
         match arg.layout.abi {
             Abi::Aggregate { .. } => {}
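For intuition: `Pointer(AddressSpace::DATA).size(cx)` is the target's data-pointer width, 8 bytes on x86_64, so the check above makes any Rust-ABI return value larger than 16 bytes indirect. Below is a minimal standalone sketch of that rule, assuming the 8-byte pointer case; the function and parameter names are made up for illustration and are not rustc APIs.

// A standalone sketch of the predicate added above, not rustc code.
// Assumes an 8-byte data pointer, as on x86_64.
fn uses_return_area_pointer(return_value_size: u64, pointer_size: u64) -> bool {
    // Mirrors `arg.layout.size > Pointer(AddressSpace::DATA).size(cx) * 2`,
    // with `arg_idx.is_none()` (i.e. "this is the return value") already
    // established by the caller.
    return_value_size > pointer_size * 2
}

fn main() {
    // A 16-byte u128 still comes back in the two return registers...
    assert!(!uses_return_area_pointer(16, 8));
    // ...while a 32-byte value such as `Option<u128>` is now always
    // returned through a return area pointer.
    assert!(uses_return_area_pointer(32, 8));
}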
diff --git a/tests/codegen/i128-x86-align.rs b/tests/codegen/i128-x86-align.rs
index 3e6ed2b8e16..6b1da445c40 100644
--- a/tests/codegen/i128-x86-align.rs
+++ b/tests/codegen/i128-x86-align.rs
@@ -19,13 +19,15 @@ pub struct ScalarPair {
 
 #[no_mangle]
 pub fn load(x: &ScalarPair) -> ScalarPair {
     // CHECK-LABEL: @load(
+    // CHECK-SAME: sret([32 x i8]) align 16 dereferenceable(32) %_0,
     // CHECK-SAME: align 16 dereferenceable(32) %x
     // CHECK: [[A:%.*]] = load i32, ptr %x, align 16
     // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr %x, i64 16
     // CHECK-NEXT: [[B:%.*]] = load i128, ptr [[GEP]], align 16
-    // CHECK-NEXT: [[IV1:%.*]] = insertvalue { i32, i128 } poison, i32 [[A]], 0
-    // CHECK-NEXT: [[IV2:%.*]] = insertvalue { i32, i128 } [[IV1]], i128 [[B]], 1
-    // CHECK-NEXT: ret { i32, i128 } [[IV2]]
+    // CHECK-NEXT: store i32 [[A]], ptr %_0, align 16
+    // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr %_0, i64 16
+    // CHECK-NEXT: store i128 [[B]], ptr [[GEP]], align 16
+    // CHECK-NEXT: ret void
     *x
 }
 
@@ -53,29 +55,23 @@ pub fn alloca() {
 
 #[no_mangle]
 pub fn load_volatile(x: &ScalarPair) -> ScalarPair {
     // CHECK-LABEL: @load_volatile(
+    // CHECK-SAME: sret([32 x i8]) align 16 dereferenceable(32) %_0,
     // CHECK-SAME: align 16 dereferenceable(32) %x
-    // CHECK: [[TMP:%.*]] = alloca [32 x i8], align 16
     // CHECK: [[LOAD:%.*]] = load volatile %ScalarPair, ptr %x, align 16
-    // CHECK-NEXT: store %ScalarPair [[LOAD]], ptr [[TMP]], align 16
-    // CHECK-NEXT: [[A:%.*]] = load i32, ptr [[TMP]], align 16
-    // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
-    // CHECK-NEXT: [[B:%.*]] = load i128, ptr [[GEP]], align 16
+    // CHECK-NEXT: store %ScalarPair [[LOAD]], ptr %_0, align 16
+    // CHECK-NEXT: ret void
     unsafe { std::intrinsics::volatile_load(x) }
 }
 
 #[no_mangle]
 pub fn transmute(x: ScalarPair) -> (std::mem::MaybeUninit<i128>, i128) {
-    // CHECK-LABEL: define { i128, i128 } @transmute(i32 noundef %x.0, i128 noundef %x.1)
-    // CHECK: [[TMP:%.*]] = alloca [32 x i8], align 16
-    // CHECK-NEXT: store i32 %x.0, ptr [[TMP]], align 16
-    // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
+    // CHECK-LABEL: @transmute(
+    // CHECK-SAME: sret([32 x i8]) align 16 dereferenceable(32) %_0,
+    // CHECK-SAME: i32 noundef %x.0, i128 noundef %x.1
+    // CHECK: store i32 %x.0, ptr %_0, align 16
+    // CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr %_0, i64 16
     // CHECK-NEXT: store i128 %x.1, ptr [[GEP]], align 16
-    // CHECK-NEXT: [[LOAD1:%.*]] = load i128, ptr [[TMP]], align 16
-    // CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
-    // CHECK-NEXT: [[LOAD2:%.*]] = load i128, ptr [[GEP2]], align 16
-    // CHECK-NEXT: [[IV1:%.*]] = insertvalue { i128, i128 } poison, i128 [[LOAD1]], 0
-    // CHECK-NEXT: [[IV2:%.*]] = insertvalue { i128, i128 } [[IV1]], i128 [[LOAD2]], 1
-    // CHECK-NEXT: ret { i128, i128 } [[IV2]]
+    // CHECK-NEXT: ret void
     unsafe { std::mem::transmute(x) }
 }
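The updated CHECK lines can be exercised outside the test suite. A hypothetical `repro.rs` along the lines of the test's `ScalarPair`, compiled with `rustc -O --emit=llvm-ir repro.rs` on x86_64-unknown-linux-gnu, should now emit a `void` definition taking a `ptr sret([32 x i8]) align 16` return slot instead of returning `{ i32, i128 }` by value (exact parameter attributes may differ by rustc version):

// repro.rs: a hypothetical standalone version of the `load` test above.
// The struct shape mirrors the test's `ScalarPair`: an { i32, i128 }
// scalar pair, 32 bytes with 16-byte alignment on current x86_64.
#[derive(Clone, Copy)]
pub struct ScalarPair {
    pub a: i32,
    pub b: i128,
}

#[no_mangle]
pub fn load(x: &ScalarPair) -> ScalarPair {
    // With the change above, the copy is written through the caller's
    // return area pointer rather than returned in registers.
    *x
}

fn main() {}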
diff --git a/tests/codegen/tuple-layout-opt.rs b/tests/codegen/tuple-layout-opt.rs
index 601563bc061..3202e4749c5 100644
--- a/tests/codegen/tuple-layout-opt.rs
+++ b/tests/codegen/tuple-layout-opt.rs
@@ -19,28 +19,28 @@ pub fn test_ScalarZstFirst(_: ScalarZstFirst) -> ScalarZstFirst {
 }
 
 type ScalarPairZstLast = (u8, u128, ());
-// CHECK: define {{(dso_local )?}}{ i128, i8 } @test_ScalarPairZstLast(i128 %_1.0, i8 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairZstLast(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairZstLast(_: ScalarPairZstLast) -> ScalarPairZstLast {
     loop {}
 }
 
 type ScalarPairZstFirst = ((), u8, u128);
-// CHECK: define {{(dso_local )?}}{ i8, i128 } @test_ScalarPairZstFirst(i8 %_1.0, i128 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairZstFirst(ptr sret([32 x i8]) align 16 %_0, i8 %_1.0, i128 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairZstFirst(_: ScalarPairZstFirst) -> ScalarPairZstFirst {
     loop {}
 }
 
 type ScalarPairLotsOfZsts = ((), u8, (), u128, ());
-// CHECK: define {{(dso_local )?}}{ i128, i8 } @test_ScalarPairLotsOfZsts(i128 %_1.0, i8 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairLotsOfZsts(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairLotsOfZsts(_: ScalarPairLotsOfZsts) -> ScalarPairLotsOfZsts {
     loop {}
 }
 
 type ScalarPairLottaNesting = (((), ((), u8, (), u128, ())), ());
-// CHECK: define {{(dso_local )?}}{ i128, i8 } @test_ScalarPairLottaNesting(i128 %_1.0, i8 %_1.1)
+// CHECK: define {{(dso_local )?}}void @test_ScalarPairLottaNesting(ptr sret([32 x i8]) align 16 %_0, i128 %_1.0, i8 %_1.1)
 #[no_mangle]
 pub fn test_ScalarPairLottaNesting(_: ScalarPairLottaNesting) -> ScalarPairLottaNesting {
     loop {}
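All four tuples in this test lower, after layout optimization, to a scalar pair of one u128 and one u8, which is why every signature changes the same way. Below is a quick sanity check of the sizes involved, assuming a current x86_64 rustc where `u128` is 16-byte aligned; at 32 bytes these values exceed the two 8-byte return registers and so take the new return-area-pointer path.

// Sanity check for the tuple layouts asserted in the test above.
type ScalarPairZstLast = (u8, u128, ());
type ScalarPairLottaNesting = (((), ((), u8, (), u128, ())), ());

fn main() {
    // The ZSTs contribute nothing; the u128 + u8 pair is padded out to
    // 32 bytes at 16-byte alignment, matching `sret([32 x i8]) align 16`.
    assert_eq!(std::mem::size_of::<ScalarPairZstLast>(), 32);
    assert_eq!(std::mem::align_of::<ScalarPairZstLast>(), 16);
    assert_eq!(std::mem::size_of::<ScalarPairLottaNesting>(), 32);
}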