*log using "C:\Users\Siddharth\Documents\phd\pete\Rawls\stata_copy\stata\CFE Labor\2005\IND_05_sumstats_log.log",replace

********************************************************
**PROGRAM 1: GIVES SUMMARY STATS FOR INDIA 2004-05	  **
********************************************************

* Load the person-level file written by the generating do-file
* 'ind_2004_gendata.do' (a copy of that do-file is appended after -exit-).
* Locations used on other machines are kept here for reference:
*cd "C:\Users\Rui\Dropbox\Beyond_GDP\IND"
*use "C:\Users\Siddharth\Documents\phd\pete\Rawls\stata_copy\stata\61\Rawls\final\ind_05.dta", clear
* use "C:\Users\klenow\Documents\Rawls\IND\ind_04.dta", clear
cd "/Users/xurui/Dropbox/Beyond_GDP/IND"
use "CFE Labor/2005/ind_05.dta", clear

label variable leisure "Leisure hours in a month (normalized between 0 and 1)"

* Recorded age starts at 0, so every age is shifted up by one.
replace age = age + 1
*replace age=100 if age>=100

* Discard shifted ages above 100 and records without an age.
* Stata ranks missing values above every number, so age > 100 alone already
* removes missing ages - the second condition is spelled out for readability.
drop if age > 100 | missing(age)

* Rescale the weights so that they sum to one over the records kept so far.
* The weight variable is rebuilt from scratch, so it ends up as the last
* variable in the dataset and carries no variable label.
rename weight w_raw
egen w_sum = total(w_raw)
generate weight = w_raw / w_sum
drop w_raw w_sum

* Survey-weighted means, with the multiplier mult_l as probability weight.
svyset [pweight = mult_l]
*svymean hhsize age hhexp hhinc leisure
svy: mean hhsize age exp leisure


* Variable list and unweighted summary statistics.
describe
summarize
*list

* Produces the outsheets for Chad.

* The eldest-member flag pcode and the raw multiplier are not exported. The
* member serial number takes over the name pcode, but the keep below discards
* it again - only the disabled outsheet at the end of this section uses it.
drop pcode mult_l
rename srl_n_m pcode
rename exp hhexp

* Restrict the data to the exported variables, in the exported column order.
local outvars hhid hhsize age hhexp leisure weight
keep `outvars'
order `outvars'

* Stata copy, with hhid displayed as a whole number.
format hhid %20.0f
save "IND_05.dta", replace

* Plain-text copy, with weight written in scientific notation.
format weight %11.4e
outfile using "IND_05.txt", replace wide
*sort hhid pcode
*outsheet hhid pcode age hhexp leisure weight using IND_04.csv , comma replace
*log close

* Stop here - everything after this line is background material only.
exit



*Note: The following is not part of PROGRAM 1. It is included only
* to provide background on how household expenditure and leisure were calculated.

#delimit ;
* From here on every command, and every star comment, ends at a semicolon *;

* Work from the 61st-round folder. All relative paths below depend on it *;
*cd "C:\Users\Rui\Dropbox\Beyond_GDP/rawls/61";
cd "/Users/xurui/Dropbox/Beyond_GDP/rawls/61";

* Start from an empty dataset and request memory. NOTE(review) - the memory
request presumably only matters on older Stata releases *;
clear;
set memory 500m;
*log using "C:\Documents and Settings\Sid\My Documents\phd\pete\Rawls\stata_copy\stata\61\rawls\final\rnd_61.smcl", replace;


*********** GENERATING LEISURE AND CONSUMPTION ******************;

*********** Rural&Urban Data - 61st round - 2004-05 *****************;


 *use "10/data/master_11.dta", clear;
 *use "C:\Documents and Settings\Sid\My Documents\phd\pete\Rawls\stata_copy\stata\61\10\data\master_11.dta", clear;


***********EXPENDITURE USING SCHEDULE 10 LEVEL 11 *************************;
* Input has one row per household and consumption serial number, holding the
spending of each household on broad consumption categories. This is distinct
from the Schedule 1 information, which is not necessarily needed for this
round. The file holds 124620 unique households *;
use "10/data/master_11.dta", clear;
* use "C:\Documents and Settings\Sid\My Documents\phd\pete\Rawls\stata_copy\stata\61\10\data\master_11.dta", clear;


* 30-day items. Serial numbers 1 to 22 are recorded on a 30-day basis. Serial
number 23 is supposed to be their household total, but it is missing for some
households with non-zero consumption, so the total is derived here instead.
The sum is only filled in on the rows that enter it, and the max step copies
it to every row of the household *;
egen sum30_part = total(cons_30) if serial_no<23, by(hhid);
egen all_30 = max(sum30_part), by(hhid);
* 14 households (55 rows) record nothing below serial number 23. They are set
to zero here, although dropping them may be more sensible - a household that
pays for schooling or medical care but not for food looks suspicious *;
replace all_30 = 0 if all_30==.;


* 365-day items (medical expenses, education, clothing and footwear), summed
per household and then rescaled to 30 days. The original note refers to serial
numbers 24 to 26, whereas the filter below keeps 24 to 28.
NOTE(review) - confirm which range is intended *;
egen sum365_part = total(cons_365) if serial_no>23 & serial_no<29, by(hhid);
egen non_dur_365 = max(sum365_part), by(hhid);
* 5046 missing values, all belonging to households that spent nothing on these
serial numbers *;
replace non_dur_365 = 0 if non_dur_365==.;
gen non_dur_365_scaled = non_dur_365*30/365;

gen tot_exp = all_30 + non_dur_365_scaled;

* Reduce to one row per household. tot_exp is identical on every row of a
household, so the mean simply carries it over. Durables are not included *;
collapse (mean) tot_exp, by(hhid);

* 6 households without consumption data in the master file end up at zero and
are removed *;
drop if tot_exp==0;

sort hhid;
rename tot_exp exp;
label variable exp "Total exp of household excluding durables (30 days)";
save "Rawls/final/india_61_exp.dta", replace;
*save "C:\Documents and Settings\Sid\My Documents\phd\pete\Rawls\stata_copy\stata\61\Rawls\final\india_61_exp.dta",replace;

clear;




***********LEISURE USING SCHEDULE 10 *************************;
#delimit ;
use "10/data/master_6.dta", clear;
*use "C:\Documents and Settings\Sid\My Documents\phd\pete\Rawls\stata_copy\stata\61\10\data\master_6.dta", clear;


******* Generating unique identifier *******;
* One id per household - member - activity row. The member serial number and
the activity serial number are placed behind the household id *;
gen double u_id = 1000*hhid + 10*srl_n_m + srl_n_a;
label variable u_id "Unique Identifier for HH-Individual-Activity data";
* duplicates drop u_id, force;
* That check dropped 0 observations. The intensity days of every individual
were also checked to add up to 7 for the whole week, and no problems were
found *;


* 23 observations have no age. Their daily activity status is 99, a code that
was only assigned to people under the age of 4, so these individuals are
treated as younger than 4 and given age 2 *;
replace age = 2 if age==.;

******* Generating household size (check from other data source)*******;
* Household size is taken to be the largest member serial number within the
household *;
egen hhsz = max(srl_n_m), by(hhid);

******* Generating Hours Worked Using Time Disposition Data************;


*** Assumption on what contributes towards hours worked. ***;

* The scalar employment_rule selects which activity codes count as work.
  Rule 1 - activity codes below 60, plus code 93
  Any other value - activity codes below 60 only
The original note calls rule 1 the base case, while the value set below is 2.
NOTE(review) - confirm which rule is intended *;
scalar employment_rule=2;

* working is 1 on rows whose daily activity status counts as work under the
selected rule and 0 on every other row *;
gen working = da_status<60 | (scalar(employment_rule)==1 & da_status==93);
label variable working "Those who are employed";



* Weekly hours of those counted as working. They follow from the number of days
(in intensity terms) spent on each activity and from the two assumptions below
about the hours in a full and in a half intensity day *;

* Hours in a full intensity day *;
scalar full_day_hours=8;

* Hours in a half intensity day *;
scalar half_day_hours=2.5;

* a1 to a7 hold the intensity of the activity on each day of the week. A value
of 10 is counted as one full day and a value of 5 as half a day, in both cases
only when the row is a working activity *;
forvalues d=1/7 {;
gen full_day_`d' = working if a`d'==10;
gen half_day_`d' = working/2 if a`d'==5;
};

* Days per activity row, in full-day equivalents. Days left missing above
count as zero *;
egen full_days = rowtotal(full_day_*);
egen half_days = rowtotal(half_day_*);

* Days per household member, added up over all activity rows of that member *;
egen full_days_total = total(full_days), by(hhid srl_n_m);
egen half_days_total = total(half_days), by(hhid srl_n_m);

egen total_days_worked = rowtotal(full_days_total half_days_total);

* Weekly hours. The first 5 day-equivalents count at the full rates (8 hours
per full day, 2.5 hours per half day), with full days filling those 5 slots
first. Everything beyond 5 day-equivalents counts at half the rate *;
gen hrs_week=0;

* Case 1 - at most 5 day-equivalents, everything at the full rates *;
replace hrs_week = full_days_total*full_day_hours + half_days_total*2*half_day_hours
	if total_days_worked<=5;

* Case 2 - more than 5 day-equivalents and at least 5 full days. Full days
beyond the fifth and all half days count at half the rate *;
replace hrs_week = 5*full_day_hours + (full_days_total-5)*(full_day_hours/2) + half_days_total*2*(half_day_hours/2)
	if total_days_worked>5 & full_days_total>=5;

* Case 3 - more than 5 day-equivalents and k full days, for k from 4 down to 0.
The k full days and the first (5-k) day-equivalents of half days count at the
full rates, the remaining half days at half the rate *;
forvalues k=4(-1)0 {;
replace hrs_week = `k'*full_day_hours + (5-`k')*2*(half_day_hours) + (half_days_total-(5-`k'))*2*(half_day_hours/2)
	if total_days_worked>5 & full_days_total==`k';
};

* Hours worked in a month - weekly hours times 52 weeks, spread over 12 months *;
gen hrs_month = hrs_week*52/12;


******* Cleaning Up *******;

* Keep only what is needed at the person level. The multiplier mult_comb is
carried forward under the name mult_l *;
order u_id hhid mult_comb srl_n_m srl_n_a da_status tot_day hrs_week hrs_month working age hhsz;
keep u_id hhid mult_comb srl_n_m srl_n_a hrs_week hrs_month age hhsz;
ren mult_comb mult_l;
sort hhid srl_n_m srl_n_a;



******* Collapse out activity level. Will reduce data to HH-individual level *******;

* One row per household member. The mean is taken over the activity rows of
each member *;
collapse (mean) mult_l=mult_l age=age hhsz=hhsz hrs_month=hrs_month, by(hhid srl_n_m);

* Leisure is the share of 5840/12 hours per month that is not worked *;
gen leisure=(5840/12-hrs_month)/(5840/12);
* This label describes leisure, so it is attached to leisure and not to
hrs_month. hrs_month gets its own label (hours worked) after the merge *;
lab var leisure "Leisure hours in a month (normalized)";

* Attach household expenditure with an old-style match merge on hhid. The
master data are sorted by household and member here, and the expenditure file
was sorted by hhid before it was saved *;
sort hhid srl_n_m;
merge hhid using "Rawls/final/india_61_exp.dta";
*merge hhid using "C:\Documents and Settings\Sid\My Documents\phd\pete\Rawls\stata_copy\stata\61\Rawls\final\india_61_exp.dta";
* Keep matched records only. 315 observations were present in the hours worked
data but not in the consumption data *;
keep if _merge==3;
drop _merge;

rename hhsz hhsize;

* Normalised weight - each multiplier divided by the sum of all multipliers *;
su mult_l;
gen mult_total=r(sum);
gen weight=mult_l/mult_total;
drop mult_total;

* Variable labels *;
label variable weight "Survey Weights normalized to sum to 1";
label variable hhsize "Household Size";
label variable hrs_month "Hours worked in a month";
label variable mult_l "Mulipliers from Schedule 10 (employmeny-unemployment)";

******* Generating pcode - takes value 1 for the eldest person in the household and 0 otherwise ******;

* Flag every member whose age equals the highest age in the household *;
egen highest_age=max(age),by(hhid);
gen eldest=1 if highest_age==age;
gen pcode=1 if eldest==1;
* At this stage, pcode can take value 1 for more than one person in each
household if several members share the same highest age *;

* Running count of flagged members within the household. Ties in age are
broken by the member serial number srl_n_m. Sorting on hhid and age alone
leaves tied members in an arbitrary order, so the member who keeps pcode
equal to 1 could change from one run to the next. The sort is done right
here so that it does not depend on earlier commands preserving the order *;
bysort hhid (age srl_n_m): gen eldest_1=sum(eldest);
replace pcode=0 if eldest_1>1|pcode==.;
* Now pcode takes value 1 for exactly one person in each household, namely the
eldest member with the lowest serial number *;
* There are 78 households in which the eldest person is less than 10 yrs old,
and 3 households in which the eldest person is less than 5 years old *;
drop  highest_age eldest eldest_1;

lab var pcode "Takes 1 for eldest member of household and 0 otherwise";

* Person-level file read by PROGRAM 1 *;
*save "Rawls/final/ind_05.dta", replace;
save "/Users/xurui/Dropbox/Beyond_GDP/IND/CFE Labor/2005/ind_05.dta",replace;
*save "C:\Documents and Settings\Sid\My Documents\phd\pete\Rawls\stata_copy\stata\61\Rawls\final\ind_05.dta", replace;

*log close;
