|
64 | 64 | },
|
65 | 65 | {
|
66 | 66 | "cell_type": "code",
|
67 |
| - "execution_count": null, |
| 67 | + "execution_count": 1, |
68 | 68 | "metadata": {
|
69 | 69 | "collapsed": false
|
70 | 70 | },
|
71 |
| - "outputs": [], |
| 71 | + "outputs": [ |
| 72 | + { |
| 73 | + "name": "stdout", |
| 74 | + "output_type": "stream", |
| 75 | + "text": [ |
| 76 | + "/bin/sh: pyspark: command not found\r\n" |
| 77 | + ] |
| 78 | + } |
| 79 | + ], |
72 | 80 | "source": [
|
73 | 81 | "!pyspark"
|
74 | 82 | ]
|
|
82 | 90 | },
|
83 | 91 | {
|
84 | 92 | "cell_type": "code",
|
85 |
| - "execution_count": null, |
| 93 | + "execution_count": 2, |
86 | 94 | "metadata": {
|
87 | 95 | "collapsed": false
|
88 | 96 | },
|
89 |
| - "outputs": [], |
| 97 | + "outputs": [ |
| 98 | + { |
| 99 | + "data": { |
| 100 | + "text/plain": [ |
| 101 | + "<pyspark.context.SparkContext at 0x103923610>" |
| 102 | + ] |
| 103 | + }, |
| 104 | + "execution_count": 2, |
| 105 | + "metadata": {}, |
| 106 | + "output_type": "execute_result" |
| 107 | + } |
| 108 | + ], |
90 | 109 | "source": [
|
91 | 110 | "sc"
|
92 | 111 | ]
|
|
113 | 132 | },
|
114 | 133 | {
|
115 | 134 | "cell_type": "code",
|
116 |
| - "execution_count": null, |
| 135 | + "execution_count": 3, |
117 | 136 | "metadata": {
|
118 | 137 | "collapsed": false
|
119 | 138 | },
|
|
404 | 423 | " print user_id, count, user_info"
|
405 | 424 | ]
|
406 | 425 | },
|
| 426 | + { |
| 427 | + "cell_type": "markdown", |
| 428 | + "metadata": {}, |
| 429 | + "source": [ |
| 430 | + "## DataFrames" |
| 431 | + ] |
| 432 | + }, |
| 433 | + { |
| 434 | + "cell_type": "markdown", |
| 435 | + "metadata": {}, |
| 436 | + "source": [ |
| 437 | + "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. " |
| 438 | + ] |
| 439 | + }, |
| 440 | + { |
| 441 | + "cell_type": "markdown", |
| 442 | + "metadata": {}, |
| 443 | + "source": [ |
| 444 | + "Given the Spark Context, create a SQLContext:" |
| 445 | + ] |
| 446 | + }, |
| 447 | + { |
| 448 | + "cell_type": "code", |
| 449 | + "execution_count": null, |
| 450 | + "metadata": { |
| 451 | + "collapsed": true |
| 452 | + }, |
| 453 | + "outputs": [], |
| 454 | + "source": [ |
| 455 | + "from pyspark.sql import SQLContext\n", |
| 456 | + "sqlContext = SQLContext(sc)" |
| 457 | + ] |
| 458 | + }, |
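| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "To make the DataFrame concept concrete, here is a minimal sketch (assuming Spark 1.3+): build a small DataFrame from local rows with `createDataFrame`. The row names and values are purely illustrative:" |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": { |
| | + "collapsed": true |
| | + }, |
| | + "outputs": [], |
| | + "source": [ |
| | + "from pyspark.sql import Row\n", |
| | + "\n", |
| | + "# Hypothetical sample rows; any RDD of Row objects works\n", |
| | + "rows = sc.parallelize([Row(name=\"Alice\", age=25), Row(name=\"Bob\", age=30)])\n", |
| | + "df_example = sqlContext.createDataFrame(rows)\n", |
| | + "df_example.show()" |
| | + ] |
| | + }, |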
| 459 | + { |
| 460 | + "cell_type": "markdown", |
| 461 | + "metadata": {}, |
| 462 | + "source": [ |
| 463 | + "Create a dataframe based on the content of a file:" |
| 464 | + ] |
| 465 | + }, |
| 466 | + { |
| 467 | + "cell_type": "code", |
| 468 | + "execution_count": null, |
| 469 | + "metadata": { |
| 470 | + "collapsed": true |
| 471 | + }, |
| 472 | + "outputs": [], |
| 473 | + "source": [ |
| 474 | + "df = sqlContext.jsonFile(\"file:/path/file.json\")" |
| 475 | + ] |
| 476 | + }, |
| 477 | + { |
| 478 | + "cell_type": "markdown", |
| 479 | + "metadata": {}, |
| 480 | + "source": [ |
| 481 | + "Display the content of the DataFrame:" |
| 482 | + ] |
| 483 | + }, |
| 484 | + { |
| 485 | + "cell_type": "code", |
| 486 | + "execution_count": null, |
| 487 | + "metadata": { |
| 488 | + "collapsed": true |
| 489 | + }, |
| 490 | + "outputs": [], |
| 491 | + "source": [ |
| 492 | + "df.show()" |
| 493 | + ] |
| 494 | + }, |
| 495 | + { |
| 496 | + "cell_type": "markdown", |
| 497 | + "metadata": {}, |
| 498 | + "source": [ |
| 499 | + "Print the schema:" |
| 500 | + ] |
| 501 | + }, |
| 502 | + { |
| 503 | + "cell_type": "code", |
| 504 | + "execution_count": null, |
| 505 | + "metadata": { |
| 506 | + "collapsed": true |
| 507 | + }, |
| 508 | + "outputs": [], |
| 509 | + "source": [ |
| 510 | + "df.printSchema()" |
| 511 | + ] |
| 512 | + }, |
| 513 | + { |
| 514 | + "cell_type": "markdown", |
| 515 | + "metadata": {}, |
| 516 | + "source": [ |
| 517 | + "Select a column:" |
| 518 | + ] |
| 519 | + }, |
| 520 | + { |
| 521 | + "cell_type": "code", |
| 522 | + "execution_count": null, |
| 523 | + "metadata": { |
| 524 | + "collapsed": true |
| 525 | + }, |
| 526 | + "outputs": [], |
| 527 | + "source": [ |
| 528 | + "df.select(\"column_name\")" |
| 529 | + ] |
| 530 | + }, |
| 531 | + { |
| 532 | + "cell_type": "markdown", |
| 533 | + "metadata": {}, |
| 534 | + "source": [ |
| 535 | + "Create a DataFrame with rows matching a given filter:" |
| 536 | + ] |
| 537 | + }, |
| 538 | + { |
| 539 | + "cell_type": "code", |
| 540 | + "execution_count": null, |
| 541 | + "metadata": { |
| 542 | + "collapsed": true |
| 543 | + }, |
| 544 | + "outputs": [], |
| 545 | + "source": [ |
| 546 | + "df.filter(df.column_name > 10)" |
| 547 | + ] |
| 548 | + }, |
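| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "Note that `select` and `filter` are lazy and return new DataFrames; chain an action such as `show()` to materialize the result (a sketch reusing the illustrative column name above):" |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": { |
| | + "collapsed": true |
| | + }, |
| | + "outputs": [], |
| | + "source": [ |
| | + "# Filter, then materialize the matching rows with an action\n", |
| | + "df.filter(df.column_name > 10).show()" |
| | + ] |
| | + }, |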
| 549 | + { |
| 550 | + "cell_type": "markdown", |
| 551 | + "metadata": {}, |
| 552 | + "source": [ |
| 553 | + "Aggregate the results and count:" |
| 554 | + ] |
| 555 | + }, |
| 556 | + { |
| 557 | + "cell_type": "code", |
| 558 | + "execution_count": 5, |
| 559 | + "metadata": { |
| 560 | + "collapsed": false |
| 561 | + }, |
| 562 | + "outputs": [ |
| 563 | + { |
| 564 | + "ename": "NameError", |
| 565 | + "evalue": "name 'df' is not defined", |
| 566 | + "output_type": "error", |
| 567 | + "traceback": [ |
| 568 | + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
| 569 | + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", |
| 570 | + "\u001b[0;32m<ipython-input-5-af17cfa6d2c8>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupBy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"column_name\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", |
| 571 | + "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" |
| 572 | + ] |
| 573 | + } |
| 574 | + ], |
| 575 | + "source": [ |
| 576 | + "df.groupBy(\"column_name\").count()" |
| 577 | + ] |
| 578 | + }, |
| 579 | + { |
| 580 | + "cell_type": "markdown", |
| 581 | + "metadata": {}, |
| 582 | + "source": [ |
| 583 | + "Convert a RDD to a DataFrame (by inferring the schema):" |
| 584 | + ] |
| 585 | + }, |
| 586 | + { |
| 587 | + "cell_type": "code", |
| 588 | + "execution_count": null, |
| 589 | + "metadata": { |
| 590 | + "collapsed": true |
| 591 | + }, |
| 592 | + "outputs": [], |
| 593 | + "source": [ |
| 594 | + "df = sqlContext.inferSchema(my_data)" |
| 595 | + ] |
| 596 | + }, |
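| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "Note: `inferSchema` is deprecated as of Spark 1.3; `createDataFrame` is the preferred equivalent (a sketch, assuming `my_data` is an RDD of tuples or `Row` objects):" |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": { |
| | + "collapsed": true |
| | + }, |
| | + "outputs": [], |
| | + "source": [ |
| | + "# Preferred in Spark 1.3+: infers the schema from the data\n", |
| | + "df = sqlContext.createDataFrame(my_data)" |
| | + ] |
| | + }, |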
| 597 | + { |
| 598 | + "cell_type": "markdown", |
| 599 | + "metadata": {}, |
| 600 | + "source": [ |
| 601 | + "Register the DataFrame as a table:" |
| 602 | + ] |
| 603 | + }, |
| 604 | + { |
| 605 | + "cell_type": "code", |
| 606 | + "execution_count": null, |
| 607 | + "metadata": { |
| 608 | + "collapsed": true |
| 609 | + }, |
| 610 | + "outputs": [], |
| 611 | + "source": [ |
| 612 | + "df.registerTempTable(\"dataframe_name\")" |
| 613 | + ] |
| 614 | + }, |
| 615 | + { |
| 616 | + "cell_type": "markdown", |
| 617 | + "metadata": {}, |
| 618 | + "source": [ |
| 619 | + "Run a SQL Query on a DataFrame registered as a table:" |
| 620 | + ] |
| 621 | + }, |
| 622 | + { |
| 623 | + "cell_type": "code", |
| 624 | + "execution_count": null, |
| 625 | + "metadata": { |
| 626 | + "collapsed": true |
| 627 | + }, |
| 628 | + "outputs": [], |
| 629 | + "source": [ |
| 630 | + "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD" |
| 631 | + ] |
| 632 | + }, |
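| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "If an RDD is needed instead, the underlying RDD of `Row` objects is available through the DataFrame's `rdd` property (a minimal sketch):" |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": { |
| | + "collapsed": true |
| | + }, |
| | + "outputs": [], |
| | + "source": [ |
| | + "row_rdd = df_from_sql.rdd  # RDD of Row objects\n", |
| | + "row_rdd.take(5)" |
| | + ] |
| | + }, |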
407 | 633 | {
|
408 | 634 | "cell_type": "markdown",
|
409 | 635 | "metadata": {},
|
|