A Rigorous Study of Deep Copy in PHP

In this post, we will go through the process of deep copying a PHP variable in user land (i.e., in pure PHP) step by step, describing the challenges facing every step, resolving them and going forward.

(TL;DR: check this Github gist for the final solution)

The Challenges

There are three particular challenges for solving the deep copy problem in PHP:

  1. Some things simply can not be copied (e.g., resources like file handles, and objects like MySQLi instances)
  2. References are invisible in PHP. They can only be set in PHP, and accessing them will automatically dereference them.
  3. Everything is deep copied by default in PHP, except objects. They are shallow copied.

Now we will tackle these challenges one by one. Let’s start with the one liner solution:

// Deep copy by round-tripping $variable (received by reference) through serialize()/unserialize().
function deep_copy(&$variable) { return unserialize(serialize($variable)); }

This one-liner is the best easy deep copy solution we can find. It deep copies objects, handles references and even circular references (as long as all referenced targets are available in $variable, otherwise their value will be used), and is pretty fast and straightforward.

The only problem with this method will be visible when one takes a look at the internal PHP code:

static void php_var_serialize_intern(smart_str *buf, zval *struc, php_serialize_data_t var_hash) /* {{{ Appends the serialized form of struc to buf, recursing into array/object members. */
{
	zend_long var_already;
	HashTable *myht;

	if (EG(exception)) { /* an exception is already pending: emit nothing */
		return;
	}

	if (var_hash && (var_already = php_add_var_hash(var_hash, struc))) { /* a nonzero result is written below as the back-reference number */
		if (Z_ISREF_P(struc)) { /* PHP reference: "R:<n>;" */
			smart_str_appendl(buf, "R:", 2);
			smart_str_append_long(buf, var_already);
			smart_str_appendc(buf, ';');
			return;
		} else if (Z_TYPE_P(struc) == IS_OBJECT) { /* non-reference object: "r:<n>;" */
			smart_str_appendl(buf, "r:", 2);
			smart_str_append_long(buf, var_already);
			smart_str_appendc(buf, ';');
			return;
		}
	}

again: /* re-entered from the IS_REFERENCE case after unwrapping */
	switch (Z_TYPE_P(struc)) {
		case IS_FALSE:
			smart_str_appendl(buf, "b:0;", 4);
			return;

		case IS_TRUE:
			smart_str_appendl(buf, "b:1;", 4);
			return;

		case IS_NULL:
			smart_str_appendl(buf, "N;", 2);
			return;

		case IS_LONG:
			php_var_serialize_long(buf, Z_LVAL_P(struc));
			return;

		case IS_DOUBLE: { /* "d:<digits>;" */
				char *s;

				smart_str_appendl(buf, "d:", 2);
				s = (char *) safe_emalloc(PG(serialize_precision), 1, MAX_LENGTH_OF_DOUBLE + 1); /* scratch buffer sized from serialize_precision; released by efree() below */
				php_gcvt(Z_DVAL_P(struc), (int)PG(serialize_precision), '.', 'E', s);
				smart_str_appends(buf, s);
				smart_str_appendc(buf, ';');
				efree(s);
				return;
			}

		case IS_STRING:
			php_var_serialize_string(buf, Z_STRVAL_P(struc), Z_STRLEN_P(struc));
			return;

		case IS_OBJECT: {
				zval retval;
				zval fname;
				int res;
				zend_class_entry *ce = Z_OBJCE_P(struc);

				if (ce->serialize != NULL) {
					/* has custom handler */
					unsigned char *serialized_data = NULL;
					size_t serialized_length;

					if (ce->serialize(struc, &serialized_data, &serialized_length, (zend_serialize_data *)var_hash) == SUCCESS) {
						smart_str_appendl(buf, "C:", 2); /* output shape: C:<name length>:"<class name>":<data length>:{<data>} */
						smart_str_append_unsigned(buf, ZSTR_LEN(Z_OBJCE_P(struc)->name));
						smart_str_appendl(buf, ":\"", 2);
						smart_str_append(buf, Z_OBJCE_P(struc)->name);
						smart_str_appendl(buf, "\":", 2);

						smart_str_append_unsigned(buf, serialized_length);
						smart_str_appendl(buf, ":{", 2);
						smart_str_appendl(buf, (char *) serialized_data, serialized_length);
						smart_str_appendc(buf, '}');
					} else { /* handler did not return SUCCESS: write NULL instead */
						smart_str_appendl(buf, "N;", 2);
					}
					if (serialized_data) {
						efree(serialized_data);
					}
					return;
				}

				if (ce != PHP_IC_ENTRY && zend_hash_str_exists(&ce->function_table, "__sleep", sizeof("__sleep")-1)) { /* class defines __sleep (skipped when ce is PHP_IC_ENTRY) */
					ZVAL_STRINGL(&fname, "__sleep", sizeof("__sleep") - 1);
					BG(serialize_lock)++; /* counter is raised only for the duration of the __sleep call */
					res = call_user_function_ex(CG(function_table), struc, &fname, &retval, 0, 0, 1, NULL);
					BG(serialize_lock)--;
					zval_dtor(&fname);

					if (EG(exception)) { /* __sleep threw: release its result and stop */
						zval_ptr_dtor(&retval);
						return;
					}

					if (res == SUCCESS) {
						if (Z_TYPE(retval) != IS_UNDEF) {
							if (HASH_OF(&retval)) { /* __sleep returned something with a hash table: serialize the named properties */
								php_var_serialize_class(buf, struc, &retval, var_hash);
							} else {
								php_error_docref(NULL, E_NOTICE, "__sleep should return an array only containing the names of instance-variables to serialize");
								/* we should still add element even if it's not OK,
								 * since we already wrote the length of the array before */
								smart_str_appendl(buf,"N;", 2);
							}
							zval_ptr_dtor(&retval);
						}
						return; /* note: nothing is appended when retval is IS_UNDEF */
					}
					zval_ptr_dtor(&retval);
				}

				/* fall-through: objects not fully handled above are written by the IS_ARRAY branch */
			}
		case IS_ARRAY: {
			uint32_t i;
			zend_bool incomplete_class = 0;
			if (Z_TYPE_P(struc) == IS_ARRAY) { /* plain array: "a:<count>:{" */
				smart_str_appendl(buf, "a:", 2);
				myht = Z_ARRVAL_P(struc);
				i = zend_array_count(myht);
			} else { /* object reached via the fall-through above */
				incomplete_class = php_var_serialize_class_name(buf, struc);
				myht = Z_OBJPROP_P(struc);
				/* count after serializing name, since php_var_serialize_class_name
				 * changes the count if the variable is incomplete class */
				i = zend_array_count(myht);
				if (i > 0 && incomplete_class) {
					--i; /* presumably accounts for the MAGIC_MEMBER entry skipped in the loop below */
				}
			}
			smart_str_append_unsigned(buf, i);
			smart_str_appendl(buf, ":{", 2);
			if (i > 0) {
				zend_string *key;
				zval *data;
				zend_ulong index;

				ZEND_HASH_FOREACH_KEY_VAL_IND(myht, index, key, data) {

					if (incomplete_class && strcmp(ZSTR_VAL(key), MAGIC_MEMBER) == 0) { /* NOTE(review): key is dereferenced before the !key check below; presumably incomplete-class property tables never hold integer keys -- confirm */
						continue;
					}

					if (!key) { /* integer key */
						php_var_serialize_long(buf, index);
					} else { /* string key */
						php_var_serialize_string(buf, ZSTR_VAL(key), ZSTR_LEN(key));
					}

					/* we should still add element even if it's not OK,
					 * since we already wrote the length of the array before */
					if ((Z_TYPE_P(data) == IS_ARRAY && Z_TYPE_P(struc) == IS_ARRAY && Z_ARR_P(data) == Z_ARR_P(struc))
						|| (Z_TYPE_P(data) == IS_ARRAY && Z_ARRVAL_P(data)->u.v.nApplyCount > 1)
					) {
						smart_str_appendl(buf, "N;", 2); /* array that is struc itself, or whose nApplyCount exceeds 1: write NULL instead of recursing */
					} else {
						if (Z_TYPE_P(data) == IS_ARRAY && ZEND_HASH_APPLY_PROTECTION(Z_ARRVAL_P(data))) {
							Z_ARRVAL_P(data)->u.v.nApplyCount++; /* raised around the recursive call, lowered right after it */
						}
						php_var_serialize_intern(buf, data, var_hash);
						if (Z_TYPE_P(data) == IS_ARRAY && ZEND_HASH_APPLY_PROTECTION(Z_ARRVAL_P(data))) {
							Z_ARRVAL_P(data)->u.v.nApplyCount--;
						}
					}
				} ZEND_HASH_FOREACH_END();
			}
			smart_str_appendc(buf, '}');
			return;
		}
		case IS_REFERENCE:
			struc = Z_REFVAL_P(struc); /* unwrap the reference and dispatch again on the referenced value */
			goto again;
		default: /* every type not handled above is written as integer 0 */
			smart_str_appendl(buf, "i:0;", 4);
			return;
	}
}
/* }}} */

As you can see in lines 179-180 of the above code snippet, everything that can not be serialized is simply replaced by an int=0 equivalent. Plus, __sleep, __wakeup and custom serialization handlers are called on objects, which might have side-effects. But the major issue is the default int=0 replacement, which will mask all resources and unserializable objects.

This is very unfortunate, because otherwise this one liner solution would save us a few hours of headaches, as will follow. Keep in mind that we can’t use “partial serialization” and stitch them together, replacing resources with their actual variables, because serialization only handles references correctly if all of them are contained in one variable.

Keep this C code in mind, we will return to it later to steal some ideas.

The Object Challenge

 

If you copy a primitive PHP type like $a=$b, PHP will internally create a new ZVAL and copy $b to $a (ZVALs, aka Zend Values are C structures equivalent to a PHP variable, having the ability to store all PHP types, from ints and floats to arrays and objects and resources).

This also works fine when copying arrays, $a=$b will iterate through the array and copy each value from $b to $a, with the same key. And if subarrays are available, since they are copied too, this process will repeat for all of them, resulting in a perfect deep copied array (unless references are involved; keep reading).

Oddly this is somewhat different for objects. Still, using $a=$b for objects will copy the ZVAL, but the ZVAL is not the object itself here, but instead the pointer-to-object. So for example the code $o=new stdClass; $o->a='a'; will first create a new object, then create a ZVAL and put the object pointer in it, then point $o to that ZVAL.  (The Symbol Table, i.e., the list of variables in PHP, is itself a ZVAL, an associative array, with keys representing names of variables and values representing their ZVAL; so here $o points to that ZVAL means that an entry is added to Symbol Table ZVAL, named $o, that points to the object pointer ZVAL.)

Thus, when you copy that object by assignment, e.g.,

$o2=$o;

you are basically copying the object-pointer ZVAL. The actual object’s ZVAL will now have a refCount of 2 (A refCount, is an integer in the ZVAL structure that defines how many variables are pointing to it. When it reaches 0, the ZVAL is garbage collected). You can observe all of this using the following code snippet:

 

// $o and $o2 hold pointers to the same object, so the append made through
// $o2 changes the one shared object.
$o=new stdClass;
$o->a='a';
$o2=$o;
$o2->a.='b';
debug_zval_dump($o2);
//#outputs:
// object(stdClass)#1 (1) refcount(3){
//   ["a"]=>
//   string(2) "ab" refcount(1)
// }

You might ask why the refCount is 3 instead of 2? Well, that’s because when calling debug_zval_dump, we are sending a copy of $o2 to that function (function parameters are by value, i.e., they are copied into the function symbol table). That’s why in our original one liner example above, and in all following examples, deep_copy() will have &$variable as argument, to prevent an implicit native copying on function call.

At first, this might confuse one into saying that objects are passed and assigned by reference in PHP. That is not the case, and will cause so much ambiguity going forward if not straightened out. Objects are stored in an object pool, and their pointers are passed around in PHP, just like many other high level languages. These pointers can be copied, but they are not references.

References on the other hand, in PHP, are variable aliases. For example, $a=&$b simply means that $a is the same as $b, it’s just a different name. If you change either, the same underlying variable will change. If you unset one, the other one and the value will remain. To make it more clear, consider the following code:

// Demonstrates that re-binding a by-reference parameter detaches it from the caller's variable.
function f(&$t) // $t starts out as an alias of the caller's variable
{
  $a=5;
  $t=&$a; // $t is re-bound to the local $a; it no longer aliases the caller's variable
  $t++; // increments $a, not the caller's variable
}
$b=1;
f($b);
echo $b,PHP_EOL; //outputs 1

The code above outputs one. Let’s investigate why. $b is passed to the function by reference, which means $t inside the function is a reference to $b, an alias. Then inside the function $t, an alias of $b (the same as $b), is totally changed: it is re-aliased to $a. This means that $t is totally disconnected from $b, and is now the same as $a. From now forward, the function has no way of accessing $b.

What this means is that $a = &$b should more accurately be written as $a =& $b. The assign by reference operation is a totally different one than assignment in PHP. It is not related to assignment by any means. It only accepts variables and creates an alias-like relationship between them. It does not actually copy anything. It does not return the reference to $b, and then assign it to $a, that’s why you can’t do f(&$x), reference of a variable is non-existent in high-level PHP, only variable aliases are meaningful.

Well, that’s a little bad. Because later on, we have no way of telling whether something is a reference/alias or not. Once a reference-assignment is done, those two variables are exactly the same to the eye of the PHP code. There is no way to even know that these two are referring to the same underlying ZVAL. And that’s problematic when copying things, because if we deep copy both, we end up with two ZVALs instead of one in our deep copy, and if we shallow copy both, we end up with zero new ZVALs in our deep copy, the deep copy will just be another instance of our original copy!

The following figure tries to illustrate this point:

php-references-and-objects

 

As visible, $a and $b are two different variables, both pointing to object 1, and $c is the same variable as $a. Modifying $b, will not modify object 1, unless it is dereferenced first with -> operator. For example, $b=5 will not modify object 1, it will just change pointer 2 to int(5). That’s why you can’t change variables in a function, if you don’t send them by reference, because a copy will be sent, but you can modify object properties inside a function, because a copy pointer to the same object will be sent, and the same object can be accessed using that pointer. Remember that changing $c in the above figure, changes $a too.

The conclusion is that object assignment is not the same as reference assignment in PHP, and is also not the same as other types of assignments, to the PHP code eye.

But, the unfortunate thing is that, no matter which one of $a, $b or $c you access in the above example, they will all look exactly the same to you. If you are given three variables, you have virtually no way of knowing their difference, they will all automatically dereference to object 1 once used.

Thus, to deep copy an array of objects, we can’t deep copy every entry, and we can’t shallow copy every entry either. We need to deep copy (i.e., clone for objects) some, and we need to shallow copy some others. In our figure above, we need to deep copy $a, we need to assign $b to the same object (shallow copy) and we need to reference copy $c (addressed in the next section).

This means that we need a means of detecting if two variables are the same object or not. After careful consideration of all PHP features, the only viable high-level solution is using SPL (Standard PHP Library). SPL has two entities that can help us, one is spl_object_hash which is bound to return a unique string for each internal object at a given time in script’s life, and can be used to check two variables and know whether they point to the same object or not, and also SplObjectStorage class which is an object set, only storing one copy of each object (think of it as a PHP array with objects for keys). They are the same concept, and we use the first one in our deep_copy example:

 

$v) 
			$res[$k]=deep_copy($variable[$k],$object_pool,$depth+1); 
	} 
	elseif (is_object($variable)) 
	{ 
		$hash=spl_object_hash($variable); 
		if (isset($object_pool[$hash])) 
			$res=$object_pool[$hash]; 
		else 
			$res=$object_pool[$hash]=clone $variable; 
	} 
	else 
		$res=$variable; //implicit copy for all other types 
	return $res; 
} 

This version of deep copy will work perfectly on copying $a=[$a,$b] from the image above, resulting in one clone of the underlying object, and two assignments to the cloned object, i.e., a perfect deep copy.

Run it on a large enough array/object, and you will not be so happy with it causing a few errors. It appears that you can’t clone all objects in PHP, a great many internal objects are uncloneable. That’s only natural though, some objects have states that go low level, under the skin of PHP, and the developers simply did not implement them cloneable, such as MySQLi.

The next version, adds support for uncloneable objects, by simply using the shallow-copy operator on them and hoping for the best. At least, this will not cause errors, but in some cases might not end up in a perfect deep copy, because the underlying state will not be copied; but since there is practically no way to even access that underlying state, this is as good as it gets:

 
$v) 
			$res[$k]=deep_copy($variable[$k],$object_pool,$depth+1); 
	} 
	elseif (is_object($variable)) 
	{ 
		$hash=spl_object_hash($variable); 
		if (isset($object_pool[$hash])) 
			$res=$object_pool[$hash]; 
		else 
		{ 
			$reflection=new ReflectionObject($variable); 
			if ($reflection->isCloneable()===false) 
				$res=$object_pool[$hash]=$variable; 
			else 
				$res=$object_pool[$hash]=clone $variable; 
		} 
	} 
	else 
		$res=$variable; //implicit copy for all other types 
	return $res; 
} 

This version uses Reflection to determine whether an object is cloneable or not, and deep copies only those that are.
Keep in mind that we do not iterate over object properties, deep copying sub-objects as well, because that undermines encapsulation principles. If an object is coded properly, it will automatically clone its sub-objects using __clone magic method itself.

Also note that $object_pool is maintained per original invocation of deep_copy and is not global: a global pool would undermine garbage collection and would prevent making multiple deep copies of one object. It is also not defined as static, in order to be thread-safe.

The Array Circular Reference Challenge

References only need to be handled for array elements. Directly deep copying an object reference, e.g., $a=&$o; $a2=deep_copy($a); should behave similar to deep copying an actual object, e.g., $a2=deep_copy($o);, and deep copying a variable reference will simply replace the older copy, not posing any risks, e.g., $a=5; $b=&$a; $list=[$a,$b]; $list2=deep_copy($list); will simply copy 5 to a new $a, and then copy it again to $a via $b reference, doing a useless but risk-free copy.

PHP allows array elements to be references too, and this referencing can be observed in var_dump output as well. For example, observe the following piece of code:

$b=5;
$c='c';
$a=[&$b,&$c,&$c]; //three elements, all by reference; the last two alias the same $c
var_dump($a);
//outputs:
// array(3) {
//   [0]=>
//   &int(5)
//   [1]=>
//   &string(1) "c"
//   [2]=>
//   &string(1) "c"
// }

This is the only place that our deep copy will fail. If we simply deep copy $a from the example above, we are creating two copies of $c, because we have no means of knowing whether they were references or not. Our first step is to recognize references, and stop recopying them:

isCloneable()===false) 
				$res=$object_pool[$hash]=$variable; 
			else 
				$res=$object_pool[$hash]=clone $variable; 
		} 
	} 
	else 
		$res=$variable; //implicit copy for all other types 
	return $res; 
} 

Since we receive arrays by reference, we can add a new temporary entry to them, to know later on whether we have checked them or not. If we meet a previously visited array, we simply return its reference, and continue on. This new implementation is also able to deep copy arrays with circular referencing, since every element will be copied only once! However, the circular copy will not be correctly referenced itself. Running the following test, we get this result:

//Builds an array that references itself, deep copies it, then changes
//'data' in the copy and in the original to show which entries are shared.
$a=[];
$a['data']='first data';
$a['a']=&$a; //$a['a'] is a reference back to $a itself (circular)
$a2=deep_copy($a);
var_dump($a);
echo str_repeat("-",10),PHP_EOL;
var_dump($a2);
$a2['data']='copy'; //written through the copy only
$a['data']='original'; //written through the original only
echo str_repeat("-",80),PHP_EOL;
var_dump($a);
echo str_repeat("-",10),PHP_EOL;
var_dump($a2);
Output:
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
----------
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  array(1) {
    ["data"]=>
    string(10) "first data"
  }
}
--------------------------------------------------------------------------------
array(2) {
  ["data"]=>
  string(8) "original"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(8) "original"
    ["a"]=>
    *RECURSION*
  }
}
----------
array(2) {
  ["data"]=>
  string(4) "copy"
  ["a"]=>
  array(1) {
    ["data"]=>
    string(10) "first data"
  }
}

As you can see, at first the deep copy seems to be the same as the original copy, just without the recursion, but then when we change the data part of it, we see that only one instance changes. That is expected though, because if $a2['a'] was a reference to $a2 itself, it would be circular (hence *RECURSION*) as well, but it is obviously not.

The reason is rather simple. On line 8 of version 2.0 code, we return the reference to the visited array, but function return automatically dereferences the result, creating a copy of it and returning the copy. That’s why $a2['a'] is a copy of $a2 instead of a reference to it.

Resolving function return dereferencing is pretty easy, just have the function return a reference. In our example, since all returns are variables, this is as easy as adding a & before deep_copy on line 2. Let’s run the same test on version 2.1 of the code:

isCloneable()===false)
				$res=$object_pool[$hash]=$variable;
			else
				$res=$object_pool[$hash]=clone $variable;
		}
	}
	else
		$res=$variable; //implicit copy for all other types

	return $res;
}

//Same circular-reference test as before: build a self-referencing array,
//deep copy it, then change 'data' in the copy and in the original.
$a=[];
$a['data']='first data';
$a['a']=&$a; //$a['a'] is a reference back to $a itself (circular)
$a2=deep_copy($a);
var_dump($a);
echo str_repeat("-",10),PHP_EOL;
var_dump($a2);
$a2['data']='copy'; //written through the copy only
$a['data']='original'; //written through the original only
echo str_repeat("-",80),PHP_EOL;
var_dump($a);
echo str_repeat("-",10),PHP_EOL;
var_dump($a2);
Outputs:
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
----------
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
--------------------------------------------------------------------------------
array(2) {
  ["data"]=>
  string(8) "original"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(8) "original"
    ["a"]=>
    *RECURSION*
  }
}
----------
array(2) {
  ["data"]=>
  string(4) "copy"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}

Yay! We got the circular reference in place. But although $a2 is circularly referenced by itself, “copy” from the test is only set to first data. This is weird. Let’s investigate this further, by running another test on our code:


Output:
test1:
array(2) {
  ["data"]=>
  string(4) "copy"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
--------------------------------------------------------------------------------
test2:
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(4) "copy"
    ["a"]=>
    array(2) {
      ["data"]=>
      string(10) "first data"
      ["a"]=>
      *RECURSION*
    }
  }
}
array(2) {
  ["data"]=>
  string(4) "copy"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
--------------------------------------------------------------------------------
test3:
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    array(2) {
      ["data"]=>
      string(10) "first data"
      ["a"]=>
      array(2) {
        ["data"]=>
        string(4) "copy"
        ["a"]=>
        array(2) {
          ["data"]=>
          string(10) "first data"
          ["a"]=>
          *RECURSION*
        }
      }
    }
  }
}
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    array(2) {
      ["data"]=>
      string(4) "copy"
      ["a"]=>
      array(2) {
        ["data"]=>
        string(10) "first data"
        ["a"]=>
        *RECURSION*
      }
    }
  }
}
--------------------------------------------------------------------------------
The original array:
array(2) {
  ["data"]=>
  string(8) "original"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(8) "original"
    ["a"]=>
    *RECURSION*
  }
}

Study the three tests above, see what's different about them. Each one changes $a2['data'] at some recursion level, and then var_dumps both $a2['data'] and $a['a']['data']. By observing the outputs, it is clear that recursion and referencing are assigned properly, but values are not. Observing the few lines of code in our deep_copy function, we realize that non-reference values of arrays are copied twice, once at the most outer else, where $res=$variable, and also once in the foreach loop on line 14.

Well, let's make the assignment on line 14 by reference. Keep in mind that the outer-most else's assignment, the implicit copy of all types, can not be by reference otherwise we won't be copying anything. Adding a simple & to assignment of line 14 will result in the following output:

Output of version 2.2: circular references with once copying
test1:
array(2) {
  ["data"]=>
  string(4) "copy"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(10) "first data"
    ["a"]=>
    *RECURSION*
  }
}
--------------------------------------------------------------------------------
test2:
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(4) "copy"
    ["a"]=>
    *RECURSION*
  }
}
array(2) {
  ["data"]=>
  string(4) "copy"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(4) "copy"
    ["a"]=>
    *RECURSION*
  }
}
--------------------------------------------------------------------------------
test3:
array(2) {
  ["data"]=>
  string(10) "first data"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(4) "copy"
    ["a"]=>
    *RECURSION*
  }
}
array(2) {
  ["data"]=>
  string(4) "copy"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(4) "copy"
    ["a"]=>
    *RECURSION*
  }
}
--------------------------------------------------------------------------------
The original array:
array(2) {
  ["data"]=>
  string(8) "original"
  ["a"]=>
  &array(2) {
    ["data"]=>
    string(8) "original"
    ["a"]=>
    *RECURSION*
  }
}

Very interesting! It seems that $a2['a'] and all deeper recursions are the same ZVAL, whereas $a2 itself is another ZVAL! Basically, instead of creating one ZVAL and referencing it from itself, we are somehow creating two ZVALs, referencing the second one inside the first one, and referencing the second one inside the second one. We are unwrapping one level of the recursion, unknowingly.

Looking at the code carefully to find all instances where copying is made, and testing different scenarios, we figure out that the issue lies in the way we call deep_copy. In fact, if we do another call to deep_copy, like $a2=deep_copy($a); $a3=deep_copy($a2);, then no matter what we do to $a2, it won't change $a or $a3. The copying is done when calling deep_copy.

In PHP, to retain the reference returned by a reference-returning function in a variable, you must assign by reference that variable. To properly call deep_copy, we should do $a2=& deep_copy($a);, and that will not unwrap anything.

Alright! So now we have a very nice deep copy solution. Let's test it a little further. The following test is the first one that comes to mind, making circular referencing a tad bit more complicated:

//Circular reference combined with a by-value copy of the same array.
$a=['ref'=>&$a]; //'ref' is a reference back to $a itself
$a['val']=$a; //'val' is assigned by value from $a, which already contains 'ref'
var_dump($a);
$a2=&deep_copy($a); //assigned by reference to keep the reference deep_copy returns
var_dump($a2);
/**The first var_dump outputs:
array(2) {
  ["ref"]=>
  &array(2) {
    ["ref"]=>
    *RECURSION*
    ["val"]=>
    *RECURSION*
  }
  ["val"]=>
  array(2) {
    ["ref"]=>
    *RECURSION*
    ["val"]=>
    *RECURSION*
  }
}*/

This test fails with an infinite recursion. However, do not lose hope. We will return to this test later. Before leaving it though, let's give it a name: The Nasty Test.

Reference Identification

So far we were able to (1) detect references to arrays (in arrays), (2) identify the target copy array and (3) recreate the reference in our deep copy to the target copy array.

This was a very important concept and step. However, we only did this for arrays (in arrays), and we do not yet support that for other types (in arrays). Basically, the following example can not be distinguished using our deep copy solution:


  int(5)
  [1]=>
  &int(5)
  [2]=>
  int(5)
}
array(3) {
  [0]=>
  int(5)
  [1]=>
  int(5)
  [2]=>
  int(5)
}
*/

We could detect references because we would touch an array once visited, and detect that touch if the same array was used later, implying a reference to the same array. We could identify that array because of line 12 of the deep_copy function, because the "visit" would reference back to the copied version (instead of making multiple copies). But we have none of those luxuries for other data types, because we can not touch them and mark them as visited, thus we are unable to understand whether they are by reference or not (1) and even if we could, we would be unable to identify what zval (the underlying PHP variable) they both are referring to (identification) (2), to be able to recreate that structure in our deep copy (3).

Let's tackle the problem from a whole different angle. We will rewrite the code in a way that assumes we have functions for doing (1) and (2), and then use them to perform (3).
Algorithmically, we need to copy everything, except when it is by reference, in which case we need to copy it only once and then reference it all other times. To reference it later, we need to know whether two references point to the same thing, hence Reference Identification.

Consider the following code:

$v)
		{
			if (is_ref($variable[$k])) //byref
				$res[$k]=&deep_copy($variable[$k],$object_pool,$zval_pool,$depth+1);
			else //copy
				$res[$k]=deep_copy($variable[$k],$object_pool,$zval_pool,$depth+1);
		}
		return $res;
	}	
	elseif (is_object($variable))
	{
		$hash=spl_object_hash($variable);
		if (isset($object_pool[$hash]))
			$res=$object_pool[$hash];
		else
		{
			$reflection=new ReflectionObject($variable);
			if ($reflection->isCloneable()===false)
				$res=$object_pool[$hash]=$variable;
			else
				$res=$object_pool[$hash]=clone $variable;
		}
		$zval_pool[$id]=&$res;
		return $res;
	}
	else
	{
		$zval_pool[$id]=$variable; //copy
		return $zval_pool[$id];
	}
}

The idea behind this piece of code is that, deep_copy should copy any zval, if it is not visited before (lines 33-34, 38-39). Once an already copied zval is seen, we should just return the previous copy, instead of re-copying it (lines 4-6).
Now if the zval was inside an array, and was by reference, we want to maintain that reference, by referencing it to the one copy of the zval in our $zval_pool (line 14), otherwise, we will just separate and copy it again (line 16).

This is basically what serialize does for us, for references, it stores the index (id) of that zval in the serialized zval_pool prefixed with "R:".


Theoretically, deep copy 3.0 should work fine. Unfortunately we can not test it that easily, because it relies on two PHP functions that are non-existent, and very hard to mimic: is_ref and zval_id. But it is worth implementing them inside PHP just to make sure that this deep copy algorithm is worthwhile. Here is the C code required to implement these two functions:

// Argument info declaring one required argument that is passed by reference.
// Intended for zval_id() and is_ref(), which need the caller's actual variable
// rather than a copy of it.
ZEND_BEGIN_ARG_INFO_EX(phpx_byref_arginfo, 
    1 /*pass_rest_by_reference*/, 
    0/*return_reference*/,
    1/*required_num_args*/)
ZEND_ARG_PASS_INFO(1/*by_ref*/)
ZEND_END_ARG_INFO();

PHP_FUNCTION(zval_id)
{
    //computes the address of first zval sent to us, 
    //and the rest receive id relative to that.
    zval *z;
    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,"z",&z)==FAILURE)
        return;
    long id=*(( long*)(&z->value)); //return_value will become the address
    static long base=0;
    if (base==0) base=id-1; 
    id=(id^base)>>3; //zval is at least 16 bytes
    RETURN_LONG(id);
}
PHP_FUNCTION(is_ref)
{
    // Returns true when the refcount of the zval sent to us is larger than 2.
    zval *z;
    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,"z",&z)==FAILURE)
        return;
    // 1 is the reference sent to this function, the other is the actual var.
    // If more, some other reference to the same variable exists.
    RETURN_BOOL(Z_REFCOUNT_P(z) > 2);
}

ZEND_BEGIN_ARG_INFO_EX is a needed definition because both zval_id and is_ref need to receive the actual variable instead of a copy of it (hence byref parameters). Even receiving a reference is a little problematic, as we will see shortly. zval_id receives any variable, and basically returns the memory address of its value. This implementation is PHP 7 specific. To simplify those addresses and make them more similar to indices, the first zval sent to the function is stored as the baseline and other addresses are indexed relative to that.

is_ref basically checks whether refcount of a zval is bigger than 2 or not. A refcount of 2 means that this variable has two references, one of them being the original variable, and the second being the reference sent to is_ref. If the refcount is larger than 2, it means that there is some other reference to this variable somewhere else, hence making them all references to the original value.

A few tests show that these two functions behave as expected. Now lets see if our deep copy 3.0 function works as expected:

<?php
// Test fixture: one object and one string, each also reachable through a
// reference ($oref, $vref) and through a plain copy ($ocopy, $vcopy).
$o=new stdClass;
$o->data="o data";
$oref=&$o;
$ocopy=$o;
$v="value";
$vcopy=$v." copy";
$vref=&$v;
// Slots 0-5 hold the six variables by value, slots 6-11 hold them by reference.
$a=[$o,$oref,$ocopy,$v,$vcopy,$vref
,&$o,&$oref,&$ocopy,&$v,&$vcopy,&$vref];
$a2=&deep_copy($a);
// Mutate the copy and the original independently; if the copy is truly
// deep, neither set of changes shows up in the other dump.
$a2[0]->data.=" (deep_copy)";
$a2[3].=" (deep_copy)";
$a2[4].=" (deep_copy)";

$a[0]->data.=" (original)";
$a[3].=" (original)";
$a[4].=" (original)";
var_dump($a);
echo str_repeat("-",80),PHP_EOL;
var_dump($a2);

The output of the code above will be:

array(12) {
  [0]=>
  object(stdClass)#1 (1) {
    ["data"]=>
    string(17) "o data (original)"
  }
  [1]=>
  object(stdClass)#1 (1) {
    ["data"]=>
    string(17) "o data (original)"
  }
  [2]=>
  object(stdClass)#1 (1) {
    ["data"]=>
    string(17) "o data (original)"
  }
  [3]=>
  string(16) "value (original)"
  [4]=>
  string(21) "value copy (original)"
  [5]=>
  string(5) "value"
  [6]=>
  &object(stdClass)#1 (1) {
    ["data"]=>
    string(17) "o data (original)"
  }
  [7]=>
  &object(stdClass)#1 (1) {
    ["data"]=>
    string(17) "o data (original)"
  }
  [8]=>
  &object(stdClass)#1 (1) {
    ["data"]=>
    string(17) "o data (original)"
  }
  [9]=>
  &string(5) "value"
  [10]=>
  &string(10) "value copy"
  [11]=>
  &string(5) "value"
}
array(12) {
  [0]=>
  object(stdClass)#3 (1) {
    ["data"]=>
    string(18) "o data (deep_copy)"
  }
  [1]=>
  object(stdClass)#3 (1) {
    ["data"]=>
    string(18) "o data (deep_copy)"
  }
  [2]=>
  object(stdClass)#3 (1) {
    ["data"]=>
    string(18) "o data (deep_copy)"
  }
  [3]=>
  string(17) "value (deep_copy)"
  [4]=>
  string(22) "value copy (deep_copy)"
  [5]=>
  string(5) "value"
  [6]=>
  &object(stdClass)#3 (1) {
    ["data"]=>
    string(18) "o data (deep_copy)"
  }
  [7]=>
  &object(stdClass)#3 (1) {
    ["data"]=>
    string(18) "o data (deep_copy)"
  }
  [8]=>
  object(stdClass)#3 (1) {
    ["data"]=>
    string(18) "o data (deep_copy)"
  }
  [9]=>
  &string(5) "value"
  [10]=>
  string(10) "value copy"
  [11]=>
  &string(5) "value"
}

As you can see, the test shows that everything is working as expected. The reason for [8] and [10] not being references in the deep copy is that we do not have $ocopy in our zval_pool, but instead we have a copy of it, thus it is only seen once.

The next step would be to implement is_ref and zval_id in pure PHP! Lets push our luck.

Reference Identification, in Pure PHP

zval_id is easier to do in PHP. I will draft two versions, one light and one accurate, and then end up using the accurate version:


This version can only detect ids for arrays, and returns a new id for each other data type, resulting in copying references. It also leaves traces of visiting nodes. Doesn't sound very good, but is actually a worthy solution for most scenarios, because it is very fast.

Very rarely in PHP applications, non-object non-array variables are referenced. Just ignoring such references will be a close approximation in a light, fast deep copy solution. Also, referencing objects is very slightly different from just copying their pointers, so this will also be a very good approximation. We also need a final traversal of the deep copy to remove visitation tags.

Let's draft out a more accurate but slower solution:

$v)
		if ($v==='___VISITED___')
		{
			$id=$k;
			break;
		}
	if (!isset($id))
		$id=$ids++;
	$zval=$backup;
	$zvals[$id]=&$zval; 
	return $id;
}

This new version stores every zval sent to it in a pool, assigning each one a new id. Once a zval is sent for identification, it first
creates a backup of this new zval, then modifies the zval and observes if any of the zvals in the pool change with it (thus, a reference), and if one is found, returns the id of that zval.

The only drawback of this solution, besides its O(n) running time, is possible side effects. PHP has a lot of magic in it, and backing up and changing variables can cause a lot of troubling side-effects. Fortunately, most of our basic tests and use-cases will not face the wrath of these side-effects.

Note that we could just return the id once we found it in the foreach loop, instead of running the rest of the function regardless, to achieve slightly better performance.

Also, this function has an internal state, so it is neither thread safe nor multi-use safe. It grabs a reference to all visited zvals and keeps them indefinitely, which is not a great idea. An alternative would be to receive the pool as input:

$v)
		if ($v==='___VISITED___')
		{
			$id=$k;
			break;
		}
	$zval=$backup;
	if (!isset($id))
	{
		$id=count($zvals);
		$zvals[$id]=&$zval;
	}
	return $id;
}

This is a safer zval_id, one that does not leak memory, and can be used in multiple places in one application. However, this zval_id implementation requires us to send a second parameter, the pool, although no extra code is needed, except changing the pool variable in different contexts.

Now lets try implementing is_ref. Obviously we can rely on zval_id and maintaining a pool of all visited zvals to spot references, by finding more than 1 zval with the same id in our pool. But a hackier, quicker solution will be my personal choice.

We know that two PHP functions, namely var_dump and debug_zval_dump show a tiny & when printing references inside an array. Thus, we can use them to spot references inside any array. Fortunately, whenever we need is_ref in our deep copy function, we also have the container array. Lets see a draft:

$v)
		if ($k!==$varname)
			unset($container[$k]);
	ob_start();
	var_dump($container);
	$dump=ob_get_clean();
	if (($r=strpos($dump,"=>\n  &"))===false)
		return false;
	else 
	{
		if ($r>=15+strlen($varname) /*array(_) {\n  [_]*/
			and $r<20+strlen($varname))
			return true;
	}
	return false;
}

To reduce confusion, this is_ref implementation first strips the container array (which is of course a copy due to being a byval parameter) of all elements except $varname, which holds the name of the interesting variable.

Then it starts output buffering, var_dumps the container, and inspects the result for an ampersand at a specific position. The fallback is always to assume it is not a reference, as explained above.

Although we will be fine using this is_ref implementation for our deep copy, let's push ourselves a little further and create a general purpose is_ref in pure PHP! The issue is that we would not have the container array, and our hack does not work for individual zvals. Fortunately, get_defined_vars() in PHP allows us to obtain the current symbol table (i.e., container of all variables). Unfortunately though, get_defined_vars() has to be executed in the caller's scope (the code that calls is_ref) and not in the callee's scope (inside is_ref itself).

I was unable to find a workaround for that, but a possible, dirty solution is to force the caller to send get_defined_vars() as a parameter to is_ref:


This is_ref implementation discovers the variable by grepping the source code of the caller for a variable name in the designated spot, and thus will fail when any other kind of reference is sent, like an ArrayDimFetch (e.g., $something[2]). It also forces the first parameter sent to it to be get_defined_vars() by inspecting the caller's code. It then sends the container and the variable name to our previous is_ref implementation for a decisive answer!

Testing the new implementations in pure PHP gives us some confidence, because we receive the exact same deep copies as using the extension function.

Resources and Uncloneables

Since we have no way of effectively deep copying Uncloneables and Resources, except ad-hoc solutions for each uncloneable object and resource, we should at least warn the user of deep copy when such instances are observed, so that they can expect side-effects happening from the deep copy (because the deep copy will have the same resource/uncloneable as the original copy).

Adding this warning to the deep copy function, we end up with the following deep copy function:

$v)
		{
			$t=&deep_copy($variable[$k],$object_pool,$zval_pool,$depth+1,$id_zvals);
			if (function_exists("is_ref"))
				if (is_ref($variable[$k]))
					$res[$k]=&$t;
				else
					$res[$k]=$t;
			else //php-based is_ref
				if (_is_ref($variable,$k))
					$res[$k]=&$t;
				else
					$res[$k]=$t;
		}
		return $res;
	}	
	elseif (is_object($variable))
	{
		$hash=spl_object_hash($variable);
		if (isset($object_pool[$hash]))
			$res=$object_pool[$hash];
		else
		{
			$reflection=new ReflectionObject($variable);
			if ($reflection->isCloneable()===false)
			{
				trigger_error("Attempting to deep copy an unclonable object (depth={$depth})");	
				$res=$object_pool[$hash]=$variable;
			}
			else
				$res=$object_pool[$hash]=clone $variable;
		}
		$zval_pool[$id]=&$res;
		return $res;
	}
	else
	{
		if (is_resource($variable))
			trigger_error("Attempting to deep copy a resource of type '".get_resource_type($variable)."' (depth={$depth})");	
		$zval_pool[$id]=$variable; //copy
		return $zval_pool[$id];
	}
}

This 55 line deep copy solution is the best I could come up with. Of course it relies on is_ref and zval_id implementations, each of which is a few lines of code themselves. Apparently, zval_id can be inlined into this function, requiring one less pool to preserve, but I'd rather not add that confusion.

You can access the final code with a few test cases in this Github gist.

Leave a reply:

Your email address will not be published.

Site Footer

Sliding Sidebar