You are on page 1of 17

/* Vertical 3-D bar chart with one bar per Subcategory value.
   No SUMVAR= is given, so the bars show frequency counts.
   The two-level data set name must be a single token (libref.member);
   it had been split across lines. */
proc gchart data=deba.candy_sales_summary;
   vbar3d subcategory;
run;

F RE QUE NCY 500

400

300

200

100

0 Ch o c o l a t e Fr u i t Gu m Mi x e d So f t S we e t

Su b c a t e g o r y

/* Horizontal 3-D bar chart of Subcategory; the listing beside the bars
   reports FREQ, CUM. FREQ, PCT and CUM. PCT for each bar. */
proc gchart data=deba.candy_sales_summary;
   hbar3d subcategory;
run;

Su b c a t e g o r y

F R E Q. 491

C U M. F R E Q. 491

PCT . 32. 73

C U M. PCT . 32. 73

Ch o c o l a t e

Fr ui t

170

661

11. 33

44. 07

Gu m

194

855

12. 93

57. 00

Mi x e d

92

947

6. 13

63. 13

So f t

365

1312

24. 33

87. 47

S we e t

188

1500

12. 53

100. 00

100

200

300

400

500

F R E QU E N C Y

/* Vertical 3-D bars per Subcategory whose height is driven by Sale_amount
   (SUMVAR=); the SUM option requests the sum statistic on the chart. */
proc gchart data=deba.candy_sales_summary;
   vbar3d subcategory / sum
                        sumvar=Sale_amount;
run;

S a l e _ A mo u n t

S UM 2954597. 8

3000000. 0

2000000. 0

1149161. 2 1024456. 2 1042054. 8

1000000. 0

748027. 86

509661. 33

$0. 00 Ch o c o l a t e Fr ui t Gu m Mi x e d So f t S we e t

Su b c a t e g o r y

/* Same Sale_amount chart, repeated side by side for every fiscal_year
   value via GROUP=. */
proc gchart data=deba.candy_sales_summary;
   vbar3d subcategory / sum
                        sumvar=Sale_amount
                        group=fiscal_year;
run;

Sal e_Am ount

SU M

800000. 00

700000. 00

600000. 00

500000. 00

400000. 00

300000. 00

200000. 00

100000. 00

$0. 00 C F GM S h r u i o o u mx f e t c i d o t l a t e 1999 S w e e t C F GMS h r u i o o u mx f e t c i d o t l a t e 2000 S w e e t C F GMS h r u i o o u mx f e t c i d o t l a t e 2001 S w e e t C F GMS h r u i o o u mx f e t c i d o t l a t e 2002 S w e e t C F GM S h r u i o o u mx f e t c i d o t l a t e 2003 S w e e t C F GMS h r u i o o u mx f e t c i d o t l a t e 2004 S w e e t Subc at egor y

Fi s c al

Y ear

/* Grouped by fiscal_year as before; SUBGROUP= additionally splits each bar
   into segments by fiscal_quarter. */
proc gchart data=deba.candy_sales_summary;
   vbar3d subcategory / sum
                        sumvar=Sale_amount
                        group=fiscal_year
                        subgroup=fiscal_quarter;
run;

F G M r u i u m x i e t d

S o f t

S w e e t

C h o c o l a t e

F G M r u i u m x i e t d

S o f t

S w e e t

C h o c o l a t e

F G M r u i u m x i e t d

S o f t

S w e e t

C h o c o l a t e

F G M r u i u m x i e t d

S o f t

S w e e t

C h o c o l a t e

F G M r u i u m x i e t d

S o f t

S w e e t

C h o c o l a t e

F G M r u i u m x i e t d

S o f t

S w e e t

Fi s

Q Q Q Q Q

Q Q Q Q

Q Q Q Q

Q Q Q Q

/* 3-D pie chart with one slice per Subcategory value, sized by frequency. */
proc gchart data=deba.candy_sales_summary;
   pie3d subcategory;
run;


3

F RE Q E NCY o f

Su b c a t e g o r y

Ch o c o l a t e 491 Fr ui t 170

Gu m 194

Mi x e d 92

S we e t 188

So f t 365

/* 3-D pie of Subcategory with explicit label placement:
   slice name and value inside the slice, percentage outside. */
proc gchart data=deba.candy_sales_summary;
   pie3d subcategory / discrete
                       value=inside
                       percent=outside
                       slice=inside;
run;

)  ( ' 

Fi s

Q Q Q Q

 

   



 

  



 

  



 



  



 

   

  2 1

        

 " " " " " " " " "

& % $ #     !

  

F RE QUE NCY

of

Su b c a t e g o r y

32. 73% 11. 33% Fr u i t 170 Ch o c o l a t e 491

12. 93%

Gu m 194

Mi x e d 92 So f t 365

S we e t 188 12. 53%

6. 13%

24. 33%

/* Same labelled pie, but each observation is weighted by Sale_amount through
   FREQ=, so slice sizes reflect sales rather than row counts.
   NOTE(review): the slice totals shown for this chart are slightly below the
   SUMVAR= totals from the bar chart; FREQ= presumably truncates non-integer
   values -- confirm against the GCHART documentation. */
proc gchart data=deba.candy_sales_summary;
   pie3d subcategory / discrete
                       value=inside
                       percent=outside
                       slice=inside
                       freq=Sale_amount;
run;

F R E QU E N C Y

of

Su b c a t e g o r y

39. 78%

Ch o c o l a t e 2954388 13. 79% Fr ui t 1024376 Gu m 747933 S we e t 1149070 So f t 1041879 15. 47%

Mi x e d 509613

10. 07%

6. 86% 14. 03%

/* Pie of Subcategory weighted by Sale_amount, with SUBGROUP= splitting the
   chart by fiscal_year.
   The PROC statement was missing and PERCENT=/SLICE= had been displaced after
   RUN; they are restored here.
   NOTE(review): the data set is assumed to be deba.candy_sales_summary, as in
   the neighbouring GCHART steps -- confirm. */
proc gchart data=deba.candy_sales_summary;
   pie3d subcategory / discrete
                       value=inside
                       percent=outside
                       slice=inside
                       freq=Sale_amount
                       subgroup=fiscal_year;
run;

45% 40% Ch o c o l a t e 4 0 %3 6 % 7 287607 Ch o c o l a Ch o c o l a t e t e 7 8Ch2o5c0o l 3a t 9e 2 3 2 10% Fr ui t Fr ui t 2 9 1 1 4 %% 417 % Ch Ch o c o lt6a t 1e 2 9 oc l a e 9 Fr ui t 9 89612 2 88157 F t 3 2 4 0 0 8 r u i t F r u i 7 47583861 4 3 Mi 6 e dt Gu x m 2 7 4 8 4 4 2 6 3S o8f S we e t S we e t Gu m 13% Fr ui t 8 Mi 1 e 1 3 7 5o1 x d S S Gu m 1Mi 1x1e5d9 578898321 0f 2t 7 8 3 1 4 0 3 1 we e t S we e S we e t S we e t 2 t Gu m Gu m 2 1 3 8 7 8 So f t 1 3 3 Mi 4 Gu6m 2 9 1 3 5 1 0x5e4d5 6 3 0 1 3 9 0 S o f t 2 9 1 1 8 5 2 3 4 4 1 3 4 1 9 1 2 4 9 3 8 46 2 13 82169 5796 2 9 So S2 1 6 4 4 1 51f t f t 0o 6 9d x e 6 Mi 1x6e Mi 2 3 d 1 3 1 6 8 4 113% 5% 9 1194 1 5 %1 8 % 95799 32 1 4 7 6 78 1 0 6 8 8 9% 10% 116% 7% 4 % 7 %7 %% 5 %9 7 %7 % 1 5 %% 1 3 % 16 3 12% 27%

Su b c a t e g o r y

Ch o c o l a t e Mi x e d

/* Histograms of Weight, one per Country level (CLASS); NOPRINT suppresses
   the tabular output. */
proc univariate data=deba.cars noprint;
   class Country;
   histogram Weight;
run;

6 54

Q E NCY o f

Su b c a t e g o r y

1999 2000 2001 2002 2003 2004

Fr ui t So f t

Gu m S we e t

/* Box-and-whisker plot of Sale_amount, one box per Region value.
   NOTE(review): PROC BOXPLOT expects the input ordered by the group variable
   (Region) -- confirm the data set is sorted. */
proc boxplot data=deba.candy_sales_summary;
   plot Sale_amount*Region;
run;

/* Build deba.turbine: each input record carries a date followed by ten
   kwatts readings; the DO loop writes one observation per reading.
   Two records per day give 20 observations per day.
   Fixes: the optional LABEL statement is now a properly closed comment (the
   open comment used to swallow the INPUT loop and DATALINES), the data
   records are re-joined into date + 10 values per line, and the PROC PRINT
   step is restored. */
data deba.turbine;
   informat day date7.;
   format day date5.;
   /* label kwatts='Average Power Output'; */
   input day @;
   do i=1 to 10;
      input kwatts @;
      output;
   end;
   drop i;
   datalines;
05JUL94 3196 3507 4050 3215 3583 3617 3789 3180 3505 3454
05JUL94 3417 3199 3613 3384 3475 3316 3556 3607 3364 3721
06JUL94 3390 3562 3413 3193 3635 3179 3348 3199 3413 3562
06JUL94 3428 3320 3745 3426 3849 3256 3841 3575 3752 3347
07JUL94 3478 3465 3445 3383 3684 3304 3398 3578 3348 3369
07JUL94 3670 3614 3307 3595 3448 3304 3385 3499 3781 3711
08JUL94 3448 3045 3446 3620 3466 3533 3590 3070 3499 3457
08JUL94 3411 3350 3417 3629 3400 3381 3309 3608 3438 3567
11JUL94 3568 2968 3514 3465 3175 3358 3460 3851 3845 2983
11JUL94 3410 3274 3590 3527 3509 3284 3457 3729 3916 3633
12JUL94 3153 3408 3741 3203 3047 3580 3571 3579 3602 3335
12JUL94 3494 3662 3586 3628 3881 3443 3456 3593 3827 3573
13JUL94 3594 3711 3369 3341 3611 3496 3554 3400 3295 3002
13JUL94 3495 3368 3726 3738 3250 3632 3415 3591 3787 3478
14JUL94 3482 3546 3196 3379 3559 3235 3549 3445 3413 3859
14JUL94 3330 3465 3994 3362 3309 3781 3211 3550 3637 3626
15JUL94 3152 3269 3431 3438 3575 3476 3115 3146 3731 3171
15JUL94 3206 3140 3562 3592 3722 3421 3471 3621 3361 3370
18JUL94 3421 3381 4040 3467 3475 3285 3619 3325 3317 3472
18JUL94 3296 3501 3366 3492 3367 3619 3550 3263 3355 3510
;
run;

proc print data=deba.turbine;
run;

/* Plain box plot of kwatts, one box per day. */
proc boxplot data=deba.turbine;
   plot kwatts*day;
run;

/* Same plot with colour options: SYMBOL sets the plotting-symbol colour;
   CFRAME=, CBOXES= and CBOXFILL= colour the frame, box outlines and box fill. */
symbol color = salmon;
/*title 'Box Plot for Power Output';*/
proc boxplot data=deba.turbine;
   plot kwatts*day / cframe   = vligb
                     cboxes   = dagr
                     cboxfill = ywh;
run;
4250

4000

3750

k w a t t s

3500

3250

3000

2750 0 5 J UL 0 6 J UL 0 7 J UL 0 8 J UL 1 1 J UL da 1 2 J UL 1 3 J UL 1 4 J UL 1 5 J UL 1 8 J UL

* The box plot displayed represents summary statistics for the analysis variable kwatts; each of the 10 box-and-whisker plots describes the variable kwatts for a particular day. The plot elements and the statistics they represent are as follows:

/* Box plot of kwatts by day with two annotation tables:
   INSET      -- NOBS and MEAN under the header 'Overall Stats';
   INSETGROUP -- MIN and MAX under the header 'Stats by day'. */
proc boxplot data=deba.turbine;
   plot kwatts*day;
   inset      nobs mean / header = 'Overall Stats';
   insetgroup min max   / header = 'Stats by day';
run;

4250

4000

3750

k w a t t s

3500

3250

3000

2750 0 5 J UL 0 6 J UL 0 7 J UL 0 8 J UL 1 1 J UL da 1 2 J UL 1 3 J UL 1 4 J UL 1 5 J UL 1 8 J UL

Box Plot

The box plot, also known as a schematic plot, appears beside the stem-and-leaf plot. Both plots use the same vertical scale. The box plot provides a visual summary of the data and

identifies outliers. The bottom and top edges of the box correspond to the sample 25th (Q1) and 75th (Q3) percentiles. The box length is one interquartile range (Q3 - Q1). The center horizontal line with asterisk endpoints corresponds to the sample median. The central plus sign (+) corresponds to the sample mean. If the mean and median are equal, the plus sign falls on the line inside the box. The vertical lines that project out from the box, called whiskers, extend as far as the data extend, up to a distance of 1.5 interquartile ranges. Values farther away are potential outliers. The procedure identifies the extreme values with a zero or an asterisk (*). If zero appears, the value is between 1.5 and 3 interquartile ranges from the top or bottom edge of the box. If an asterisk appears, the value is more extreme.

Stem-and-Leaf Plot
The first plot in the output is either a stem-and-leaf plot or a horizontal bar chart. If any single interval contains more than 49 observations, the horizontal bar chart appears. Otherwise, the stem-and-leaf plot appears. The stem-and-leaf plot is like a horizontal bar chart in that both plots provide a method to visualize the overall distribution of the data. The stem-and-leaf plot provides more detail because each point in the plot represents an individual data value. To change the number of stems that the plot displays, use PLOTSIZE= to increase or decrease the number of rows. Instructions that appear below the plot explain how to determine the values of the variable. If no instructions appear, you multiply Stem.Leaf by 1 to determine the values of the variable. For example, if the stem value is 10 and the leaf value is 1, then the variable value is approximately 10.1. For the stem-and-leaf plot, the procedure rounds a variable value to the nearest leaf. If the variable value is exactly halfway between two leaves, the value rounds to the nearest leaf with an even integer value. For example, a variable value of 3.15 has a stem value of 3 and a leaf value of 2.

Normal Probability Plot


The normal probability plot is a quantile-quantile plot of the data. The procedure plots the empirical quantiles against the quantiles of a standard normal distribution. Asterisks (*) indicate the data values. The plus signs (+) provide a straight reference line that is drawn by using the sample mean and standard deviation. If the data are from a normal distribution, the asterisks tend to fall along the reference line.

Interpreting Quantile-Quantile and Probability Plots


If the data distribution matches the theoretical distribution, the points on the plot form a linear pattern. Thus, you can use a Q-Q plot or a probability plot to determine how well a theoretical distribution models a set of measurements. The following properties of these plots make them useful diagnostics to test how well a specified theoretical distribution fits a set of measurements:
y

If the quantiles of the theoretical and data distributions agree, the plotted points fall on or near the line y=x.

If the theoretical and data distributions differ only in their location or scale, the points on the plot fall on or near the line y=ax+b. The slope a and intercept b are visual estimates of the scale and location parameters of the theoretical distribution.

Q-Q plots are more convenient than probability plots for graphical estimation of the location and scale parameters because the x-axis of a Q-Q plot is scaled linearly. On the other hand, probability plots are more convenient for estimating percentiles or probabilities. There are many reasons why the point pattern in a Q-Q plot may not be linear. Chambers et al. (1983) and Fowlkes (1987) discuss the interpretations of commonly encountered departures from linearity, and these are summarized in the following table.

Quantile-Quantile Plot Diagnostics

| Description of Point Pattern | Possible Interpretation |
|---|---|
| All but a few points fall on a line | Outliers in the data |
| Left end of pattern is below the line; right end of pattern is above the line | Long tails at both ends of the data distribution |
| Left end of pattern is above the line; right end of pattern is below the line | Short tails at both ends of the distribution |
| Curved pattern with slope increasing from left to right | Data distribution is skewed to the right |
| Curved pattern with slope decreasing from left to right | Data distribution is skewed to the left |
| Staircase pattern (plateaus and gaps) | Data have been rounded or are discrete |

In some applications, a nonlinear pattern may be more revealing than a linear pattern. However as noted by Chambers et al. (1983), departures from linearity can also be due to chance variation.

Using Box Plots to Compare Groups


In the following example, a box plot is used to compare the delay times for airline flights during the Christmas holidays with the delay times prior to the holiday period. The following statements create a data set named Flightdel with the delay times in minutes for 25 flights each day. When a flight is canceled, the delay is recorded as a missing value.
/* Build deba.flightdel: for each day, read 25 delay values (minutes) and
   write one observation per flight. A period (.) is a missing delay.
   Each day's record spans three data lines (10 + 10 + 5 values); the INPUT
   statements flow on to the next line when the current one is exhausted.
   Fix: the DATALINES block had all dates on one line and the delays
   scattered column-wise afterwards; the records are re-joined so that every
   date is followed by its own 25 values. */
data deba.flightdel;
   informat day date7.;
   format day date7.;
   input day @@;
   do flight=1 to 25;
      input delay @@;
      output;
   end;
   datalines;
16DEC88   4  12   2   2  18   5   6  21   0   0
          0  14   3   .   2   3   5   0   6  19
          7   4   9   5  10
17DEC88   1  10   3   3   0   1   5   0   .   .
          1   5   7   1   7   2   2  16   2   1
          3   1  31   5   0
18DEC88   7   8   4   2   3   2   7   6  11   3
          2   7   0   1  10   2   3  12   8   6
          2   7   2   4   5
19DEC88  15   6   9   0  15   7   1   1   0   2
          5   6   5  14   7  20   8   1  14   3
         10   0   1  11   7
20DEC88   2   1   0   4   4   6   2   2   1   4
          1  11   .   1   0   6   5   5   4   2
          2   6   6   4   0
21DEC88   2   6   6   2   7   7   5   2   5   0
          9   2   4   2   5   1   4   7   5   6
          5   0   4  36  28
22DEC88   3   7  22   1  11  11  39  46   7  33
         19  21   1   3  43  23   9   0  17  35
         50   0   2   1   0
23DEC88   6  11   8  35  36  19  21   .   .   4
          6  63  35   3  12  34   9   0  46   0
          0  36   3   0  14
24DEC88  13   2  10   4   5  22  21  44  66  13
          8   3   4  27   2  12  17  22  19  36
          9  72   2   4   4
25DEC88   4  33  35   0  11  11  10  28  34   3
         24   6  17   0   8   5   7  19   9   7
         21  17  17   2   6
26DEC88   3   8   8   2   7   7   8   2   5   9
          2   8   2  10  16   9   5  14  15   1
         12   2   2  14  18
;
run;

proc print data=deba.flightdel;
run;

In the following statements, the MEANS procedure is used to count the number of canceled flights for each day. This information is then added to the data set Times.
/* Count missing delay values per day (NMISS) and store the count as ncancel
   in deba.Cancel; NOPRINT suppresses the MEANS listing. */
proc means data=deba.flightdel noprint;
   by day;
   var delay;
   output out=deba.Cancel nmiss=ncancel;
run;

proc print data=deba.cancel;
run;

/* Attach the per-day ncancel value to every flight record of that day. */
data deba.times;
   merge deba.flightdel
         deba.Cancel;
   by day;
run;

proc print data=deba.times;
run;

The following statements create a data set named Weather that contains information about possible causes for delays. This data set is merged with the data set Times.
/* Build deba.weather: one record per (day, flight) with a free-text reason.
   The & modifier lets reason contain single embedded blanks ("Snow Storm",
   "Late Arrival"); the field ends at two consecutive blanks or end of line,
   so every record must sit on its own data line. The records had been run
   together on one line and are split here. */
data deba.weather;
   informat day date7.;
   format day date7.;
   length reason $ 16;
   input day flight reason &;
   datalines;
16DEC88  8  Fog
17DEC88  18  Snow Storm
17DEC88  23  Sleet
21DEC88  24  Rain
21DEC88  25  Rain
22DEC88  7  Mechanical
22DEC88  15  Late Arrival
24DEC88  9  Late Arrival
24DEC88  22  Late Arrival
;
run;

/* Add the reason to the matching flight records in deba.times. */
data deba.times;
   merge deba.times deba.weather;
   by day flight;
run;

proc print data=deba.times;
run;

The following statements create a box plot for the complete set of data.

/* Box plot of delay for each day; the "= ncancel" symbol variable
   distinguishes the days by their ncancel value. */
proc boxplot data=deba.times;
   plot delay*day = ncancel;
run;

80

60

d e 40 a

The delay distributions from December 22 through December 25 are drastically different from the delay distributions during the pre-holiday period. Both the mean delay and the variability of the delays are much greater during the holiday period.
Proc univariate
/* Univariate statistics for delay, computed separately for each day (BY). */
proc univariate data=deba.times;
   var delay;
   by day;
run;
*The above program gives day wise univariate statistics of the variable delay;
*The o/p of one day is given below;
-------------------------------------------- day=16DEC88 ------------------------------------------The UNIVARIATE Procedure Variable: delay Moments N Mean Std Deviation Skewness Uncorrected SS Coeff Variation 24 6.54166667 6.1783575 1.11829334 1905 94.4462294 Sum Weights Sum Observations Variance Kurtosis Corrected SS Std Error Mean 24 157 38.1721014 0.37055638 877.958333 1.26115194

A @9

A @9

A @9

A @9

C
20 0 16 88 17 88 18 88 19 88 2 0 DE C8 8 2 1 DE C8 8 da 2 2 DE C8 8 2 3 DE C8 8 2 4 DE C8 8 2 5 DE C8 8 2 6 DE C8 8

Basic Statistical Measures Location Mean Median Mode 6.541667 5.000000 0.000000 Variability Std Deviation Variance Range Interquartile Range 6.17836 38.17210 21.00000 7.50000

Tests for Location: Mu0=0 Test Student's t Sign Signed Rank -Statistict M S 5.187057 10 105 -----p Value-----Pr > |t| Pr >= |M| Pr >= |S| <.0001 <.0001 <.0001

Quantiles (Definition 5) Quantile 100% Max 99% 95% 90% 75% Q3 50% Median 25% Q1 10% 5% 1% 0% Min Estimate 21.0 21.0 19.0 18.0 9.5 5.0 2.0 0.0 0.0 0.0 0.0

Extreme Observations ----Lowest-------Highest--Value 0 0 0 0 2 Obs 18 11 10 9 15 Value 12 14 18 19 21 Obs 2 12 5 20 8

Missing Values Missing Value . -----Percent Of----Missing All Obs Obs 4.00 100.00

Count 1

/* Same per-day analysis with the PLOT option, which adds the stem-and-leaf
   plot, box plot and normal probability plot to the output. */
proc univariate data=deba.times plot;
   by day;
   var delay;
run;
/* The output is as above, plus the normal probability plot, box-and-whisker
   plot and stem-and-leaf plot. Output for one day only is given below, to
   save space. A separate box plot for each day is also shown; look out for
   0 and * marking outliers in the delay values on all the days. */

-------------------------------------------- day=16DEC88 ------------------------------------------The UNIVARIATE Procedure Variable: delay Moments N Mean Std Deviation Skewness Uncorrected SS Coeff Variation 24 6.54166667 6.1783575 1.11829334 1905 94.4462294 Sum Weights Sum Observations Variance Kurtosis Corrected SS Std Error Mean 24 157 38.1721014 0.37055638 877.958333 1.26115194

Basic Statistical Measures Location Mean Median Mode 6.541667 5.000000 0.000000 Variability Std Deviation Variance Range Interquartile Range 6.17836 38.17210 21.00000 7.50000

Tests for Location: Mu0=0 Test Student's t Sign Signed Rank -Statistict M S 5.187057 10 105 -----p Value-----Pr > |t| Pr >= |M| Pr >= |S| <.0001 <.0001 <.0001

Quantiles (Definition 5) Quantile 100% Max 99% 95% 90% 75% Q3 50% Median 25% Q1 10% 5% 1% 0% Min Extreme Observations ----Lowest---Value 0 0 0 0 2 Obs 18 11 10 9 15 ----Highest--Value 12 14 18 19 21 Obs 2 12 5 20 8 Estimate 21.0 21.0 19.0 18.0 9.5 5.0 2.0 0.0 0.0 0.0 0.0

Missing Values Missing Value . -----Percent Of----Missing All Obs Obs 4.00 100.00

Count 1

Stem 20 18 16 14 12 10 8 6 4 2 0

Leaf 0 00 0 0 0 0 000 00000 00000 0000 ----+----+----+----+

# 1 2 1 1 1 1 3 5 5 4

Boxplot 0 | | | | | +-----+ | + | *-----* +-----+ |

Normal Probability Plot 21+ | | | | 11+ | | | | 1+ * * +++ * +++ *+++ +*+ +++* +++*** +*** * * *** * * * *++ +----+----+----+----+----+----+----+----+----+----+ -2 -1 0 +1 +2 * ++++ +++

Schematic Plots
80 + | | | 70 + | | | 60 + | | | 50 + | | | 40 + | | * | 30 + * | * | | 20 + 0 | | | | | | 0 | | | | | 10 + +-----+ | | +-----+ | | | | + | | +-----+ | + | | + | *-----* +--+--+ *--+--* *-----* +-----+ *-----* | +-----+ *-----* +-----+ | | *--+--* +-----+ 0 + | +-----+ | +-----+ +-----+ | ------------+-----------+-----------+-----------+-----------+-----------+----------day 16DEC88 17DEC88 18DEC88 19DEC88 20DEC88 21DEC88

80 + | | | 70 + | | | 60 + | | | 50 + |

0 0 | | | | | | |

| |

| | | | | | | | 40 + | | | | | | | | | +-----+ | | | | | | | | 30 + | | | | | | | | | | | | | | | | | | +-----+ | | +-----+ | 20 + | | | | | | +-----+ | | | | + | | + | | | | | | + | | | | | | | | | | | | | *-----* | + | | 10 + *-----* *-----* | | *-----* +-----+ | | | | | | | | | *--+--* | | | | | +-----+ +-----+ | | | +-----+ +-----+ | | +-----+ 0 + | | | | ------------+-----------+-----------+-----------+-----------+----------day 22DEC88 23DEC88 24DEC88 25DEC88 26DEC88

/* Per-day analysis of delay with both PLOT and NORMAL options. */
proc univariate data=deba.times plot normal;
   var delay;
   by day;
run;
*In the above program, adding the NORMAL option alongside PLOT leaves the rest of the output the same, except that the tests for normality are added;
*The additional part is shown here;
Tests for Normality Test Shapiro-Wilk Kolmogorov-Smirnov Cramer-von Mises Anderson-Darling --Statistic--W D W-Sq A-Sq 0.867044 0.201598 0.189374 1.128692 -----p Value-----Pr Pr Pr Pr < > > > W D W-Sq A-Sq 0.0046 0.0127 0.0067 <0.0050

You might also like