% MONTE CARLO EXPERIMENT TO INVESTIGATE MISSING DATA & MULTICOLLINEARITY clc; % Parameters n = 100; loops = 10000; rho = 0.5; sigma = 2.5; beta = [1;2;3;4]; mc = 0; %multicollinearity md = 1; %missing data mr = 0; %missing at random nmiss = 30; %number of missing observations xmiss = 0; %replacement value for dummy-variable approach % Create Data x1 = ones(n,1); x2 = rand(n,1); x3a = 5 + rand(n,1) + rho*randn(n,1); x3b = 5 + x2 + rho*randn(n,1); x4 = randn(n,1); xmat1 = [x1,x2,x3a,x4]; xmat2 = [x1,x2,x3b,x4]; k = size(xmat1,2); % Correlation Coefficient vcv = cov(x2,x3b); rho23 = vcv(1,2)/sqrt(vcv(1,1)*vcv(2,2)); % Storage Matrices bmat = zeros(loops,3*k+1); stdmat = zeros(loops,3*k+1); tmat = zeros(loops,3*k+1); % *************** % Simulation Loop % *************** for i = (1:1:loops) % Data-Generating Process y = xmat2*beta + sigma*randn(n,1); % Multicollinearity if mc == 1; % OLS Regression & t Statistics w/ MC b = inv(xmat2'*xmat2)*(xmat2'*y); bmat(i,k+1:2*k) = b'; e = y - xmat2*b; s2 = (e'*e)/(n-k); varb = s2*inv(xmat2'*xmat2); stderr = sqrt(diag(varb)); stdmat(i,k+1:2*k) = stderr'; tmat(i,k+1:2*k) = (b./stderr)'; % OLS Regression & t Statistics w/out MC y = xmat1*beta + sigma*randn(n,1); b = inv(xmat1'*xmat1)*(xmat1'*y); bmat(i,1:k) = b'; e = y - xmat1*b; s2 = (e'*e)/(n-k); varb = s2*inv(xmat1'*xmat1); stderr = sqrt(diag(varb)); stdmat(i,1:k) = stderr'; tmat(i,1:k) = (b./stderr)'; end; % (Randomly) Missing Data if (md == 1) & (mr == 1); % Method #1: Listwise Deletion y3 = y(nmiss+1:n); xmat3 = xmat2(nmiss+1:n,:); n = size(y3,1); b = inv(xmat3'*xmat3)*(xmat3'*y3); bmat(i,1:k) = b'; e = y3 - xmat3*b; s2 = (e'*e)/(n-k); varb = s2*inv(xmat3'*xmat3); stderr = sqrt(diag(varb)); stdmat(i,1:k) = stderr'; tmat(i,1:k) = (b./stderr)'; % Method #2: Replace w/ Average n = size(y,1); x3c = [(mean(xmat2(nmiss+1:n,3))*ones(nmiss,1));xmat2(nmiss+1:n,3)]; xmat4 = [xmat2(:,1:2),x3c,xmat2(:,4)]; n = size(xmat4,1); b = inv(xmat4'*xmat4)*(xmat4'*y); bmat(i,k+1:2*k) = b'; e = y - xmat4*b; s2 = (e'*e)/(n-k); varb = s2*inv(xmat4'*xmat4); stderr = sqrt(diag(varb)); stdmat(i,k+1:2*k) = stderr'; tmat(i,k+1:2*k) = (b./stderr)'; % Method #3: Dummy-Variable Approach x3d = [xmiss*ones(nmiss,1);xmat2(nmiss+1:n,3)]; d = [ones(nmiss,1);zeros(n-nmiss,1)]; xmat5 = [xmat2(:,1:2),x3d,xmat2(:,4),d]; b = inv(xmat5'*xmat5)*(xmat5'*y); bmat(i,2*k+1:3*k+1) = b'; e = y - xmat5*b; s2 = (e'*e)/(n-k-1); varb = s2*inv(xmat5'*xmat5); stderr = sqrt(diag(varb)); stdmat(i,2*k+1:3*k+1) = stderr'; tmat(i,2*k+1:3*k+1) = (b./stderr)'; % (Systematically) Missing Data elseif (md == 1) & (mr == 0); % Sort Data data = [y,xmat2]; sortdata = sortrows(data,4); y = sortdata(:,1); xmat2 = sortdata(:,2:k+1); % Method #1: Listwise Deletion y3 = y(nmiss+1:n); xmat3 = xmat2(nmiss+1:n,:); n = size(y3,1); b = inv(xmat3'*xmat3)*(xmat3'*y3); bmat(i,1:k) = b'; e = y3 - xmat3*b; s2 = (e'*e)/(n-k); varb = s2*inv(xmat3'*xmat3); stderr = sqrt(diag(varb)); stdmat(i,1:k) = stderr'; tmat(i,1:k) = (b./stderr)'; % Method #2: Replace w/ Average n = size(y,1); x3c = [mean(xmat2(nmiss+1:n,3))*ones(nmiss,1);xmat2(nmiss+1:n,3)]; xmat4 = [xmat2(:,1:2),x3c,xmat2(:,4)]; n = size(xmat4,1); b = inv(xmat4'*xmat4)*(xmat4'*y); bmat(i,k+1:2*k) = b'; e = y - xmat4*b; s2 = (e'*e)/(n-k); varb = s2*inv(xmat4'*xmat4); stderr = sqrt(diag(varb)); stdmat(i,k+1:2*k) = stderr'; tmat(i,k+1:2*k) = (b./stderr)'; % Method #3: Dummy-Variable Approach x3d = [xmiss*ones(nmiss,1);xmat2(nmiss+1:n,3)]; d = [ones(nmiss,1);zeros(n-nmiss,1)]; xmat5 = [xmat2(:,1:2),x3d,xmat2(:,4),d]; b = inv(xmat5'*xmat5)*(xmat5'*y); bmat(i,2*k+1:3*k+1) = b'; e = y - xmat5*b; s2 = (e'*e)/(n-k-1); varb = s2*inv(xmat5'*xmat5); stderr = sqrt(diag(varb)); stdmat(i,2*k+1:3*k+1) = stderr'; tmat(i,2*k+1:3*k+1) = (b./stderr)'; end; end; % MSE Comparison if mc == 1; avgb = mean(bmat); avgvarb = mean(stdmat.^2); avgt = mean(tmat); mse = avgvarb(1:2*k) + (avgb(1:2*k) - [beta;beta]').^2; disp(['WITHOUT MULTICOLLINEARITY']); disp(' '); disp(['Correlation Coefficient (X2,X3) = ',num2str(rho23)]); disp(' '); disp([' b1 b2 b3 b4 ']); disp(num2str(avgb(1:k))); disp(' '); disp([' t1 t2 t3 t4 ']); disp(num2str(avgt(1:k))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4 ']); disp(num2str(mse(1:k))); disp(' '); disp(' '); disp(['WITH MULTICOLLINEARITY']); disp([' b1 b2 b3 b4 ']); disp(num2str(avgb(k+1:2*k))); disp(' '); disp([' t1 t2 t3 t4 ']); disp(num2str(avgt(k+1:2*k))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4 ']); disp(num2str(mse(k+1:2*k))); end if (md == 1) & (mr == 1); avgb = mean(bmat); avgvarb = mean(stdmat.^2); avgt = mean(tmat); mse = avgvarb + (avgb - [beta;beta;beta;0]').^2; disp(['RANDOMLY MISSING DATA']); disp(' '); disp(['METHOD #1: LISTWISE DELETION']); disp([' b1 b2 b3 b4 ']); disp(num2str(avgb(1:k))); disp(' '); disp([' t1 t2 t3 t4 ']); disp(num2str(avgt(1:k))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4']); disp(num2str(mse(1:k))); disp(' '); disp(' '); disp(['METHOD #2: REPLACE W/ AVERAGE']); disp([' b1 b2 b3 b4']); disp(num2str(avgb(k+1:2*k))); disp(' '); disp([' t1 t2 t3 t4']); disp(num2str(avgt(k+1:2*k))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4']); disp(num2str(mse(k+1:2*k))); disp(' '); disp(' '); disp(['METHOD #3: DUMMY-VARIABLE APPROACH']); disp([' b1 b2 b3 b4 b5']); disp(num2str(avgb(2*k+1:3*k+1))); disp(' '); disp([' t1 t2 t3 t4 t5']); disp(num2str(avgt(2*k+1:3*k+1))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4 MSE5']); disp(num2str(mse(2*k+1:3*k+1))); end if (md == 1) & (mr == 0); avgb = mean(bmat); avgvarb = mean(stdmat.^2); avgt = mean(tmat); mse = avgvarb + (avgb - [beta;beta;beta;0]').^2; disp(['SYSTEMATICALLY MISSING DATA']); disp(' '); disp(['METHOD #1: LISTWISE DELETION']); disp([' b1 b2 b3 b4']); disp(num2str(avgb(1:k))); disp(' '); disp([' t1 t2 t3 t4']); disp(num2str(avgt(1:k))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4']); disp(num2str(mse(1:k))); disp(' '); disp(' '); disp(['METHOD #2: REPLACE W/ AVERAGE']); disp([' b1 b2 b3 b4']); disp(num2str(avgb(k+1:2*k))); disp(' '); disp([' t1 t2 t3 t4']); disp(num2str(avgt(k+1:2*k))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4']); disp(num2str(mse(k+1:2*k))); disp(' '); disp(' '); disp(['METHOD #3: DUMMY-VARIABLE APPROACH']); disp([' b1 b2 b3 b4 b5']); disp(num2str(avgb(2*k+1:3*k+1))); disp(' '); disp([' t1 t2 t3 t4 t5']); disp(num2str(avgt(2*k+1:3*k+1))); disp(' '); disp([' MSE1 MSE2 MSE3 MSE4 MSE5']); disp(num2str(mse(2*k+1:3*k+1))); end % ***************************** % Sampling Distributions for b3 % ***************************** if (mc == 1); subplot(2,1,1); hist(bmat(:,3),20); title('Without Multicollinearity'); set(gca,'XLim',[-2,8]); subplot(2,1,2); hist(bmat(:,7),20); title('With Multicollinearity'); set(gca,'XLim',[-2,8]); end if (md == 1); subplot(3,1,1); hist(bmat(:,3),20); set(gca,'XLim',[-2,8]); title('Method #1: Listwise Deletion'); subplot(3,1,2); hist(bmat(:,7),20); set(gca,'XLim',[-2,8]); title('Method #2: Replace w/ Average'); subplot(3,1,3); hist(bmat(:,11),20); set(gca,'XLim',[-2,8]); title('Method #3: Dummy-Variable Approach'); end