Giriş

Veri Bilimi Nedir?

  • Disiplinler arası bir alan:
      • Programlama
      • İstatistik
      • Alan uzmanlığı - analiz

— .class #id

Veri Bilimi Nedir?

Veri Bilimi Venn Şeması

— .class #id

Houston Uçuş Verisi

     library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, last
     flights = fread("flights.csv")
     airports = fread("airports.csv")
     planes = fread("planes.csv")
     weather = fread("weather.csv")
## Warning in fread("weather.csv"): Bumped column 14 to type character on
## data row 73, field contains '""'. Coercing previously read values in this
## column from logical, integer or numeric back to character which may not
## be lossless; e.g., if '00' and '000' occurred before they will now be just
## '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too
## (if they occurred in this column before the bump). If this matters please
## rerun and set 'colClasses' to 'character' for this column. Please note
## that column type detection uses the first 5 rows, the middle 5 rows and the
## last 5 rows, so hopefully this message should be very rare. If reporting to
## datatable-help, please rerun and include the output from verbose=TRUE.

— .class #id

Verileri Gözden Geçirme

     str(flights)
## Classes 'data.table' and 'data.frame':   227496 obs. of  14 variables:
##  $ date     : chr  "2011-01-01 12:00:00" "2011-01-02 12:00:00" "2011-01-03 12:00:00" "2011-01-04 12:00:00" ...
##  $ hour     : int  14 14 13 14 14 13 13 13 14 14 ...
##  $ minute   : int  0 1 52 3 5 59 59 55 43 43 ...
##  $ dep      : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ arr      : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ dep_delay: int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ arr_delay: int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ carrier  : chr  "AA" "AA" "AA" "AA" ...
##  $ flight   : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ dest     : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ plane    : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ cancelled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ time     : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ dist     : int  224 224 224 224 224 224 224 224 224 224 ...
##  - attr(*, ".internal.selfref")=<externalptr>

— .class #id

Verileri Gözden Geçirme (Alternatif)

     library(dplyr)
     tbl_df(flights)
## # A tibble: 227,496 x 14
##                   date  hour minute   dep   arr dep_delay arr_delay
##                  <chr> <int>  <int> <int> <int>     <int>     <int>
## 1  2011-01-01 12:00:00    14      0  1400  1500         0       -10
## 2  2011-01-02 12:00:00    14      1  1401  1501         1        -9
## 3  2011-01-03 12:00:00    13     52  1352  1502        -8        -8
## 4  2011-01-04 12:00:00    14      3  1403  1513         3         3
## 5  2011-01-05 12:00:00    14      5  1405  1507         5        -3
## 6  2011-01-06 12:00:00    13     59  1359  1503        -1        -7
## 7  2011-01-07 12:00:00    13     59  1359  1509        -1        -1
## 8  2011-01-08 12:00:00    13     55  1355  1454        -5       -16
## 9  2011-01-09 12:00:00    14     43  1443  1554        43        44
## 10 2011-01-10 12:00:00    14     43  1443  1553        43        43
## # ... with 227,486 more rows, and 7 more variables: carrier <chr>,
## #   flight <int>, dest <chr>, plane <chr>, cancelled <int>, time <int>,
## #   dist <int>

— .class #id

Diğer Veriler: Hava Durumu (weather)

     tbl_df(weather)
## # A tibble: 8,723 x 14
##          date  hour  temp dew_point humidity pressure visibility wind_dir
##         <chr> <int> <dbl>     <dbl>    <int>    <dbl>      <dbl>    <chr>
## 1  2011-01-01     0  59.0      28.9       32    29.86         10      NNE
## 2  2011-01-01     1  57.2      28.4       33    29.88         10      NNE
## 3  2011-01-01     2  55.4      28.4       36    29.93         10      NNW
## 4  2011-01-01     3  53.6      28.4       38    29.94         10    North
## 5  2011-01-01     4    NA        NA       NA    29.99         10      NNW
## 6  2011-01-01     5    NA        NA       NA    30.02         10    North
## 7  2011-01-01     6  53.1      17.1       24    30.05         10    North
## 8  2011-01-01     7  53.1      16.0       23    30.07         10    North
## 9  2011-01-01     8  54.0      18.0       24    30.09         10    North
## 10 2011-01-01     9  55.4      17.6       23    30.09         10      NNE
## # ... with 8,713 more rows, and 6 more variables: wind_dir2 <int>,
## #   wind_speed <dbl>, gust_speed <dbl>, precip <dbl>, conditions <chr>,
## #   events <chr>

— .class #id

Diğer Veriler: Uçaklar (planes)

     tbl_df(planes)
## # A tibble: 2,853 x 9
##     plane  year               mfr          model no.eng no.seats speed
##     <chr> <int>             <chr>          <chr>  <int>    <int> <int>
## 1  N576AA  1991 MCDONNELL DOUGLAS DC-9-82(MD-82)      2      172    NA
## 2  N557AA  1993        MARZ BARRY      KITFOX IV      1        2    NA
## 3  N403AA  1974             RAVEN           S55A     NA        1    60
## 4  N492AA  1989 MCDONNELL DOUGLAS DC-9-82(MD-82)      2      172    NA
## 5  N262AA  1985 MCDONNELL DOUGLAS DC-9-82(MD-82)      2      172    NA
## 6  N493AA  1989 MCDONNELL DOUGLAS DC-9-82(MD-82)      2      172    NA
## 7  N477AA  1988 MCDONNELL DOUGLAS DC-9-82(MD-82)      2      172    NA
## 8  N476AA  1988 MCDONNELL DOUGLAS DC-9-82(MD-82)      2      172    NA
## 9  N504AA    NA AUTHIER ANTHONY P      TIERRA II      1        2    NA
## 10 N565AA  1987 MCDONNELL DOUGLAS DC-9-83(MD-83)      2      172    NA
## # ... with 2,843 more rows, and 2 more variables: engine <chr>, type <chr>

— .class #id

Diğer Veriler: Havalimanları (airports)

     tbl_df(airports)
## # A tibble: 3,376 x 7
##     iata              airport             city state country      lat
##    <chr>                <chr>            <chr> <chr>   <chr>    <dbl>
## 1    00M             Thigpen       Bay Springs    MS     USA 31.95376
## 2    00R Livingston Municipal       Livingston    TX     USA 30.68586
## 3    00V          Meadow Lake Colorado Springs    CO     USA 38.94575
## 4    01G         Perry-Warsaw            Perry    NY     USA 42.74135
## 5    01J     Hilliard Airpark         Hilliard    FL     USA 30.68801
## 6    01M    Tishomingo County          Belmont    MS     USA 34.49167
## 7    02A          Gragg-Wade           Clanton    AL     USA 32.85049
## 8    02C              Capitol       Brookfield    WI     USA 43.08751
## 9    02G    Columbiana County   East Liverpool    OH     USA 40.67331
## 10   03D     Memphis Memorial          Memphis    MO     USA 40.44726
## # ... with 3,366 more rows, and 1 more variables: long <dbl>

— .class #id

Row Index (Satır No)

     flights[3]
##                   date hour minute  dep  arr dep_delay arr_delay carrier
## 1: 2011-01-03 12:00:00   13     52 1352 1502        -8        -8      AA
##    flight dest  plane cancelled time dist
## 1:    428  DFW N541AA         0   48  224
     flights[5]
##                   date hour minute  dep  arr dep_delay arr_delay carrier
## 1: 2011-01-05 12:00:00   14      5 1405 1507         5        -3      AA
##    flight dest  plane cancelled time dist
## 1:    428  DFW N492AA         0   44  224

— .class #id

Head (Baş Kısım)

     head(flights)
##                   date hour minute  dep  arr dep_delay arr_delay carrier
## 1: 2011-01-01 12:00:00   14      0 1400 1500         0       -10      AA
## 2: 2011-01-02 12:00:00   14      1 1401 1501         1        -9      AA
## 3: 2011-01-03 12:00:00   13     52 1352 1502        -8        -8      AA
## 4: 2011-01-04 12:00:00   14      3 1403 1513         3         3      AA
## 5: 2011-01-05 12:00:00   14      5 1405 1507         5        -3      AA
## 6: 2011-01-06 12:00:00   13     59 1359 1503        -1        -7      AA
##    flight dest  plane cancelled time dist
## 1:    428  DFW N576AA         0   40  224
## 2:    428  DFW N557AA         0   45  224
## 3:    428  DFW N541AA         0   48  224
## 4:    428  DFW N403AA         0   39  224
## 5:    428  DFW N492AA         0   44  224
## 6:    428  DFW N262AA         0   45  224

— .class #id

Head (Baş Kısım) (Alternatif)

     flights[1:5]
##                   date hour minute  dep  arr dep_delay arr_delay carrier
## 1: 2011-01-01 12:00:00   14      0 1400 1500         0       -10      AA
## 2: 2011-01-02 12:00:00   14      1 1401 1501         1        -9      AA
## 3: 2011-01-03 12:00:00   13     52 1352 1502        -8        -8      AA
## 4: 2011-01-04 12:00:00   14      3 1403 1513         3         3      AA
## 5: 2011-01-05 12:00:00   14      5 1405 1507         5        -3      AA
##    flight dest  plane cancelled time dist
## 1:    428  DFW N576AA         0   40  224
## 2:    428  DFW N557AA         0   45  224
## 3:    428  DFW N541AA         0   48  224
## 4:    428  DFW N403AA         0   39  224
## 5:    428  DFW N492AA         0   44  224

— .class #id

Range (Aralık)

     flights[30:32]
##                   date hour minute  dep  arr dep_delay arr_delay carrier
## 1: 2011-01-30 12:00:00   13     59 1359 1456        -1       -14      AA
## 2: 2011-01-31 12:00:00   14     41 1441 1553        41        43      AA
## 3: 2011-01-01 12:00:00    7     28  728  840         8         5      AA
##    flight dest  plane cancelled time dist
## 1:    428  DFW N561AA         0   39  224
## 2:    428  DFW N505AA         0   39  224
## 3:    460  DFW N520AA         0   41  224

— .class #id

Tail (Son Kısım)

     tail(flights)
##                   date hour minute  dep  arr dep_delay arr_delay carrier
## 1: 2011-12-06 12:00:00   13      7 1307 1600         7         0      WN
## 2: 2011-12-06 12:00:00   18     18 1818 2111         8        -9      WN
## 3: 2011-12-06 12:00:00   20     47 2047 2334         7         4      WN
## 4: 2011-12-06 12:00:00    9     12  912 1031        -3        -4      WN
## 5: 2011-12-06 12:00:00    6     56  656  812        -4       -13      WN
## 6: 2011-12-06 12:00:00   16      0 1600 1713         0       -12      WN
##    flight dest  plane cancelled time dist
## 1:    471  TPA N632SW         0   98  781
## 2:   1191  TPA N284WN         0   97  781
## 3:   1674  TPA N366SW         0   94  781
## 4:    127  TUL N777QC         0   61  453
## 5:    621  TUL N727SW         0   64  453
## 6:   1597  TUL N745SW         0   59  453

— .class #id

RStudio

    setwd("/Users/mertnuhoglu/projects/dewey/r_verigazeteciligi")
    library(data.table)
    flights = fread("flights.csv")
    View(flights)

— .class #id

RStudio