Skip to content

Commit

Permalink
updates for 2013 CDC analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
kruggles7 committed Sep 25, 2014
1 parent b0194b9 commit 2be001f
Show file tree
Hide file tree
Showing 14 changed files with 1,294 additions and 18 deletions.
135 changes: 135 additions & 0 deletions clustergram/OR_clustergram_ques_RC_2013.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
%make heatmaps (clustergrams) out of the odds-ratio matrix for one question
%
% Loads odds_ratio_cell (question-pair odds ratios, one column per survey
% year 2013..2001) and qlabel (question id/label lookup), asks the user for
% a question id (e.g. 'Q01'), then:
%   1. collects the odds-ratio rows pairing that question with every other
%      question present in qlabel
%   2. replaces 'NaN'/'Inf'/empty cells with numeric sentinels
%   3. drops years and questions with missing data so the matrix is complete
%   4. median-centers each row and takes log2
%   5. draws and saves a clustergram (PNG + FIG) in clustergrams/results
%
% NOTE(review): all paths are relative -- the script must be started from
% the clustergram program directory for the cd chain below to work.

cd ..
cd ..
cd matrices
load OR_2013.mat        %provides odds_ratio_cell
load qlabel_090914.mat  %provides qlabel
cd ..
cd programs
cd clustergrams
cd results

%survey years, newest first; aligns with columns 3:9 of odds_ratio_cell
xlab={'2013', '2011', '2009', '2007', '2005', '2003', '2001'};

ques=input ('Enter in the question number you want to use (ex. Q01): ', 's');
for i=1:82
    %build zero-padded question id of the form 'Qnn'
    i_char=num2str(i);
    if length(i_char)<2
        i_char=['0' i_char];
    end
    q1=['Q' i_char];
    if strcmp(q1,ques)==1
        indx=find (strcmp(odds_ratio_cell(:,1),q1)==1);
        if isempty(indx)==0
            lab=odds_ratio_cell(indx,2);  %partner question ids for this question
            P=odds_ratio_cell(indx,3:9);  %odds ratios per survey year
            P2=cell.empty;
            qlabel2=cell.empty;
            [rl,cl]=size(qlabel);
            counter=1;
            %keep only partner questions that appear in qlabel (skip self-pairing)
            for j=1:rl
                indx=find(strcmp(qlabel{j,2},lab)==1 & strcmp(lab, ques)==0);
                if numel(indx)>0
                    P2(counter,:)=P(indx,:);
                    qlabel2(counter,:)=qlabel(j,:);
                    counter=counter+1;
                end
            end
            %replace string 'NaN' with the sentinel -10000 and 'Inf' with 10000
            indx=find(strcmp(P2,'NaN')==1);
            for j=1:numel(indx)
                P2{indx(j)}=-10000;
            end
            indx=find(strcmp(P2,'Inf')==1);
            for j=1:numel(indx)
                P2{indx(j)}=10000;
            end
            %empty cells are treated as missing data as well
            emptycells=cellfun(@isempty, P2);
            [r,c]=size(emptycells);
            for j=1:r
                for k=1:c
                    if (emptycells(j,k)==1)
                        P2{j,k}=-10000;
                    end
                end
            end
            plot_mat=cell2mat(P2);
            plot_mat=rot90(plot_mat);        %rows become years, columns questions
            [r,c]=size(plot_mat);
            plot_mat(plot_mat==-10000)=NaN;  %restore missing values as NaN

            %create second matrix keeping only year rows that are not all-NaN
            plot_mat2=double.empty;
            xlab_new=cell.empty;
            counter=1;
            for j=1:r
                indx=find(isnan(plot_mat(j,:))==0) ;
                if isempty(indx)==0 %entire row is NOT nan
                    plot_mat2(counter,:)=plot_mat(j,:);
                    xlab_new{counter}=xlab{j};
                    counter=counter+1;
                end
            end
            %remove questions that don't have all of the same years
            [r,c]=size(plot_mat2);
            plot_mat3=double.empty;
            qlab_new=cell.empty;
            counter=1;
            for j=1:c
                indx=find(isnan(plot_mat2(:,j))==1);
                if numel(indx)==0
                    plot_mat3(:,counter)=plot_mat2(:,j);
                    qlab_new{counter}=qlabel2{j,:};
                    counter=counter+1;
                end
            end

            %replace Inf sentinels with the row maximum so they do not
            %dominate the clustering
            [r,c]=size(plot_mat3);
            plot_mat3(isinf(plot_mat3)==1)=-10000 ;
            plot_mat3(plot_mat3==10000)=-10000;
            for j=1:r
                maxv=nanmax(plot_mat3(j,:));
                indx_inf=find(plot_mat3(j,:)==-10000);
                if numel(indx_inf)>0
                    plot_mat3(j,indx_inf)=maxv;
                end
            end
            %median center each row, then log2 transform
            for j=1:r
                med=nanmedian(plot_mat3(j,:));
                plot_mat3(j,:)=plot_mat3(j,:)/med;
            end
            log_rel_risk=log2(plot_mat3);
            %clamp -Inf entries (log2 of a zero odds ratio) to the smallest
            %finite value in the row. BUGFIX: the previous code used
            %min(temp), which includes -Inf itself, so the clamp was a no-op.
            for j=1:r
                temp=log_rel_risk(j,:);
                indx3=find(log_rel_risk(j,:)<-100000);
                finite_vals=temp(temp>-100000);
                if isempty(finite_vals)
                    minv=0;   %whole row was -Inf: fall back to neutral 0
                else
                    minv=min(finite_vals);
                end
                log_rel_risk(j,indx3)=minv;
            end
            log_rel_risk(isnan(log_rel_risk)==1)=0;

            cg=clustergram(log_rel_risk,'RowLabels', xlab_new, 'ColumnLabels',qlab_new,'Cluster',2, 'Colormap','jet', 'DisplayRange',3, 'Symmetric','true');
            fig=plot(cg);
            print (gcf,'-dpng',[q1 '_clustermap_OR_RC_2013.png']);
            saveas(gcf,[q1 '_clustermap_OR_RC_2013.fig']);
        end
    end
end
126 changes: 126 additions & 0 deletions heatmaps/create_hm_graph_2013.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
function [per_mat_map] = create_hm_graph_2013( question_mat, filename, race, sex, weight )
% CREATE_HM_GRAPH_2013 takes in the question binary matrix (rows = years,
% columns = subjects) and outputs the summary matrix with the weighted
% percentage of "yes" answers in each demographic subgroup. It also draws
% a heatmap of that matrix and saves it as .fig and .png files in the
% 'results' subdirectory.
% Input variables:
%   QUESTION_MAT: matrix for the question; column 1 is the year label and
%                 the remaining columns are per-subject answers
%                 (0 = no, 1 = yes, 9 = question not answered)
%   FILENAME: name of the question, used to build the saved file names
%   RACE:   per-subject race codes (1 = white, 2 = black, 3 = hispanic,
%           4 = other); column 1 is the year label
%   SEX:    per-subject sex codes (1 = girl, 2 = boy); column 1 is the
%           year label
%   WEIGHT: per-subject survey weights; column 1 is the year label
% Output:
%   PER_MAT_MAP: 15 x (number of years) matrix of percentages, flipped so
%                the display order is Total, Boys, Girls, W, B, H, O,
%                W Boys, W Girls, B Boys, B Girls, H Boys, H Girls,
%                O Boys, O Girls (top to bottom), rounded to one decimal.


% TOTAL=importdata('TOTAL.txt', '\t');
% ^ not necessary for this program because we're only interested in the students who answered and didn't leave out the Q

%strip the year-label column from every input so columns line up by subject
[r,c]=size(question_mat);
label_=question_mat(:,1);
question_mat=question_mat(:,2:c);
sex=sex(:,2:c);
race=race(:,2:c);
weight=weight(:,2:c);
[r,c]=size(question_mat);

per_mat = zeros(16,r); %NOTE(review): only rows 1:15 are ever filled or used
for i=1:r
% total(i)=TOTAL(i,1);
%index sets for this year: who said yes, and each demographic subgroup
index_yes{i}=find(question_mat(i,:)==1);
index_girls{i}=find(sex(i,:)==1);
index_boys{i}=find(sex(i,:)==2);
index_W{i}=find(race(i,:)== 1 );
index_B{i}=find(race(i,:)== 2 );
index_H{i}=find(race(i,:)== 3 );
index_O{i}=find(race(i,:)== 4 );
index_missQ{i}=find(question_mat(i,:)==9); %students who didn't answer the Q
index_nomiss{i}=find(question_mat(i,:)==0 | question_mat(i,:)==1); %answers that were NOT missing (ie. 0's and 1's / no's and yes's)
missQ(i)=length(index_missQ{i}); %number of students who didn't answer the question each year
index_total_b{i}=intersect(index_nomiss{i},index_boys{i}); %index of all boys who answered
index_total_g{i}=intersect(index_nomiss{i},index_girls{i}); %index of all girls who answered
%weighted denominators: sum of survey weights over each answering subgroup
w=weight(i,:)';
total_ans(i)=nansum(w(index_nomiss{i}));
total_girls(i)=nansum(w(index_total_g{i})); %total # of girls who answered
total_boys(i)=nansum(w(index_total_b{i})); %total number of boys who answered
total_W{i}=nansum(w(intersect(index_nomiss{i}, index_W{i}))); %total # of white students who answered
total_B{i}=nansum(w(intersect(index_nomiss{i}, index_B{i}))); %total # of black students who answered
total_H{i}=nansum(w(intersect(index_nomiss{i}, index_H{i}))); %total # of hispanic students who answered
total_O{i}=nansum(w(intersect(index_nomiss{i}, index_O{i}))); %total # of "other" students who answered
total_Wb(i)=nansum(w(intersect(index_total_b{i},index_W{i})));
total_Wg(i)=nansum(w(intersect(index_total_g{i},index_W{i})));
total_Bb(i)=nansum(w(intersect(index_total_b{i},index_B{i})));
total_Bg(i)=nansum(w(intersect(index_total_g{i},index_B{i})));
total_Hb(i)=nansum(w(intersect(index_total_b{i},index_H{i})));
total_Hg(i)=nansum(w(intersect(index_total_g{i},index_H{i})));
total_Ob(i)=nansum(w(intersect(index_total_b{i},index_O{i})));
total_Og(i)=nansum(w(intersect(index_total_g{i},index_O{i})));

%weighted numerators: sum of survey weights over "yes" in each subgroup
w=weight(i,:)';
index_yesgirls{i}=intersect(index_yes{i},index_girls{i});
index_yesboys{i}=intersect(index_yes{i},index_boys{i});
yes_girls(i)=nansum(w(index_yesgirls{i}));
yes_boys(i)=nansum(w(index_yesboys{i}));
yes_W(i)=nansum(w(intersect(index_yes{i}, index_W{i})));
yes_B(i)=nansum(w(intersect(index_yes{i}, index_B{i})));
yes_H(i)=nansum(w(intersect(index_yes{i}, index_H{i})));
yes_O(i)=nansum(w(intersect(index_yes{i}, index_O{i})));
yes_WG(i)=nansum(w(intersect(index_yesgirls{i},index_W{i})));
yes_BG(i)=nansum(w(intersect(index_yesgirls{i},index_B{i})));
yes_HG(i)=nansum(w(intersect(index_yesgirls{i},index_H{i})));
yes_OG(i)=nansum(w(intersect(index_yesgirls{i},index_O{i})));
yes_WB(i)=nansum(w(intersect(index_yesboys{i},index_W{i})));
yes_BB(i)=nansum(w(intersect(index_yesboys{i},index_B{i})));
yes_HB(i)=nansum(w(intersect(index_yesboys{i},index_H{i})));
yes_OB(i)=nansum(w(intersect(index_yesboys{i},index_O{i})));
total_yes(i)=nansum(w(index_yes{i}));
%copy cell totals into plain arrays before division
total_w(i)=total_W{i};
total_b(i)=total_B{i};
total_h(i)=total_H{i};
total_o(i)=total_O{i};
%weighted "yes" percentage per subgroup; row order is bottom-to-top of the
%final heatmap (it is flipped below)
per_mat(15, i)=total_yes(i)/total_ans(i)*100; %total
per_mat(14, i)=yes_boys(i)/total_boys(i)*100; %boys
per_mat(13, i)=yes_girls(i)/total_girls(i)*100; %girls
per_mat(12, i)=yes_W(i)/total_w(i)*100; %whites
per_mat(11, i)=yes_B(i)/total_b(i)*100; %blacks
per_mat(10, i)=yes_H(i)/total_h(i)*100; %hispanics
per_mat(9, i)=yes_O(i)/total_o(i)*100; %other
per_mat(8, i)=yes_WB(i)/total_Wb(i)*100; %WB
per_mat(7, i)=yes_WG(i)/total_Wg(i)*100; %WG
per_mat(6, i)=yes_BB(i)/total_Bb(i)*100; %BB
per_mat(5, i)=yes_BG(i)/total_Bg(i)*100; %BG
per_mat(4, i)=yes_HB(i)/total_Hb(i)*100; %HB
per_mat(3, i)=yes_HG(i)/total_Hg(i)*100; %HG
per_mat(2, i)=yes_OB(i)/total_Ob(i)*100; %OB
per_mat(1, i)=yes_OG(i)/total_Og(i)*100; %OG
end

%Make heatmap


label_year=num2cell(label_);
label_cell2={'Total', 'Boys', 'Girls', 'W', 'B', 'H', 'O', 'W Boys', 'W Girls', 'B Boys', 'B Girls', 'H Boys', 'H Girls', 'O Boys', 'O Girls'};
per_mat_map(1:15,1:r)=per_mat(1:15,1:r);
per_mat_map=flipdim(per_mat_map,1);
%choose the colour-scale maximum: smallest of 25/50/75/100 that fits the data
max_mat=max(max(per_mat_map));
if max_mat>75
M=100;
elseif max_mat>50
M=75;
elseif max_mat>25
M=50;
else
M=25;
end
%round to one decimal place for display
per_mat_map=per_mat_map*10;
per_mat_map=round(per_mat_map);
per_mat_map=per_mat_map/10;
h=figure;
%heatmap_rb is a project-local plotting helper (not a MATLAB builtin)
[hImage]=heatmap_rb(per_mat_map, label_year, label_cell2, 1, M, 0, 'Colormap','money', 'UseLogColormap', false, 'ShowAllTicks',true, 'Colorbar',true,'TextColor','k', 'FontSize', 12);
%title (title1, 'FontSize', 12);
set (gca, 'FontSize',12);
cd results
saveas (gcf, [ filename '_heatmap_2013.fig'] ); %can make pdf, png, or jpg
print (gcf, '-dpng', [ filename '_heatmap_2013.png']);
cd ..
close all


end %end of function
37 changes: 37 additions & 0 deletions heatmaps/run_all_hm_graph_2013.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
%run_all_hm_graph_2013
% Runs create_hm_graph_2013 on every question file in the NaN results
% directory, producing one heatmap per question.
% NOTE(review): the absolute path below is machine-specific (compare the
% 'rugglk01' path used in create_RR_OR_2013.m) -- confirm before running.

files1=dir(fullfile('C:','Users','kruggles7','Dropbox (Personal)','CDC','data','results_091614','NaN', '*.txt'));

% reads all the text files in the folder 'binary_NaN_files' and saves them in an array called files
% make sure that folder contains only the NaN files for the questions you want to run create_hm_graph for
N=length(files1);

%the control matrices are the same for every question, so load them once
%(the original reloaded them on every loop iteration)
cd ..
cd ..
cd data
cd Controls_061514
sex=importdata('sex-NaN.txt', '\t');
race=importdata('race-NaN.txt', '\t');
weight=importdata('weights-NaN.txt','\t');
cd ..
cd ..
cd programs
cd heatmaps

for i=1:N
    %load this question's binary matrix
    cd ..
    cd ..
    cd data
    cd results_091614
    cd NaN
    question_mat=importdata(files1(i).name, '\t');
    cd ..
    cd ..
    cd ..
    cd programs
    cd heatmaps
    %'filename' is everything before the first '-' in 'Q#-NaN.txt'
    a=char(files1(i).name);
    b=strfind(a,'-');
    filename=a(1:b(1)-1);
    [ per_mat_map ] = create_hm_graph_2013 ( question_mat, filename, race, sex, weight );
end
36 changes: 18 additions & 18 deletions odds_ratio/create_RR_OR_2013.m
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@

k=1; %counter for rows in rel_risk_cell and odds_ratio_cell

files1=dir(fullfile('C:','Users','kruggles7','Documents','MATLAB', 'CDC', 'data','results_061514','NaN', '*.txt'));
files1=dir(fullfile('C:','Users','rugglk01','Dropbox (Personal)','CDC','data','results_091614','NaN', '*.txt'));
cd ..
cd ..
cd matrices
load reverse_code.mat
cd ..
cd data
cd results_061514
cd results_091614
cd NaN

P=length(files1);
Expand All @@ -43,7 +43,7 @@
q1_RC=reverse_code(ct,1);

%filename2
files2=dir(fullfile('C:','Users','kruggles7','Documents','MATLAB', 'Rajan', 'relative_risk_final','NaN_results_010314', '*.txt'));
files2=dir(fullfile('C:','Users','rugglk01','Dropbox (Personal)','CDC','data','results_091614','NaN', '*.txt'));
N=length(files2);
for n=1:N
quest_2=importdata(files2(n).name, '\t');
Expand Down Expand Up @@ -84,8 +84,8 @@
indx2=find(year2==y);
if numel(indx1)>0 && numel(indx2)>0 %both in the matrix
year_final(counter,1)=y;
quest_1F(counter,:)=quest_1(indx1,:);
quest_2F(counter,:)=quest_2(indx2,:);
quest_1F(counter,:)=quest_1(indx1,2:c1);
quest_2F(counter,:)=quest_2(indx2,2:c2);
e=ii;
if (counter==1)
s=ii;
Expand Down Expand Up @@ -141,22 +141,22 @@
c=total_yes1_no2(i);
b=total_no1_yes2(i);

%formula for relative risk:
RR= ( a/(a+b) ) / ( c/(c+d) );

P1= a/(a+b);
P2= c/(c+d);
% %formula for relative risk:
% RR= ( a/(a+b) ) / ( c/(c+d) );
%
% P1= a/(a+b);
% P2= c/(c+d);

%formula for odds ratio:
OR= ( P1/(1-P1) ) / ( P2/(1-P2) );


%rel_risk cell matrix:
rel_risk_cell{k,1}=[filename1];
rel_risk_cell{k,2}=[filename2];
x=num2cell(RR);
rel_risk_cell(k,K)=x;
OR= (a*d)/(b*c);

%
% %rel_risk cell matrix:
% rel_risk_cell{k,1}=[filename1];
% rel_risk_cell{k,2}=[filename2];
% x=num2cell(RR);
% rel_risk_cell(k,K)=x;
%
%odds_ratio cell matrix:
odds_ratio_cell{k,1}=[filename1];
odds_ratio_cell{k,2}=[filename2];
Expand Down
Loading

0 comments on commit 2be001f

Please sign in to comment.