>>> from pyspark.sql.types import Row
>>> from datetime imort datetime
File "", line 1
from datetime imort datetime
^
SyntaxError: invalid syntax
>>> from datetime import datetime
>>>
>>>
>>> simple_date=sc.parallelize([1,"Alice",50])
>>> simple_date=sc.parallelize(1,"Alice",50)
Traceback (most recent call last):
File "", line 1, in
TypeError: parallelize() takes from 2 to 3 positional arguments but 4 were given
>>> simple_date=sc.parallelize([1,"Alice",50])
>>> simple_date
ParallelCollectionRDD[1] at parallelize at PythonRDD.scala:175
>>> simple_data
Traceback (most recent call last):
File "", line 1, in
NameError: name 'simple_data' is not defined
>>> simple_data=sc.parallelize([1,"Alice",50])
>>> simple_data
ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:175
>>> simple_date.count()
3
>>> simple_data.count()
3
>>> simple_date.first()
1
>>> simple_date.first()
1
>>> simple_date.collect()
[1, 'Alice', 50]
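
The calls above reduce to a few core RDD operations. A minimal cleaned-up sketch, assuming the live SparkContext `sc` that the pyspark shell provides:

simple_data = sc.parallelize([1, "Alice", 50])  # parallelize() takes one collection
simple_data.count()    # 3  -- number of elements
simple_data.first()    # 1  -- first element only
simple_data.collect()  # [1, 'Alice', 50]  -- pulls the whole RDD to the driver

Passing the values as separate arguments, as in the failed call above, raises a TypeError because parallelize() expects a single collection (plus an optional number of slices).
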
>>> records=sc.parallelize([[1,"Alice",50],[2,"Bob",80]])
>>> records
ParallelCollectionRDD[11] at parallelize at PythonRDD.scala:175
>>> records.collect()
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> records.count()
2
>>> records.first()
[1, 'Alice', 50]
>>> records.take(2)
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> records.take(3)
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> records.take(0)
[]
>>> records.take()
Traceback (most recent call last):
File "", line 1, in
TypeError: take() missing 1 required positional argument: 'num'
>>> records.take(1)
[[1, 'Alice', 50]]
>>> records.collect()
[[1, 'Alice', 50], [2, 'Bob', 80]]
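
As the calls above show, take(n) returns at most n elements: asking for 3 from a two-element RDD just returns both, take(0) returns an empty list, and the argument is required. first() is equivalent to take(1)[0].
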
>>> df=records.toDF()
>>> df1=records.toDF("ID","Name","Age")
Traceback (most recent call last):
File "", line 1, in
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> df.show()
+---+-----+---+
| _1| _2| _3|
+---+-----+---+
| 1|Alice| 50|
| 2| Bob| 80|
+---+-----+---+
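
Without an explicit schema, toDF() auto-names the columns _1, _2, _3. The failed attempts that follow all trip over the same fact: in Spark 2.x, RDD.toDF(schema=None, sampleRatio=None) takes a single schema argument, not one positional argument per column, and a bare comma-separated string such as "ID,Name,Age" is parsed as a DDL schema, which requires a type after each name. A working form, passing the names as one list, appears further below.
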
>>> df1=records.toDF("ID,Name",Age")
File "", line 1
df1=records.toDF("ID,Name",Age")
^
SyntaxError: EOL while scanning string literal
>>> df1=records.toDF("ID,Name,Age")
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/protocol.py", line 320, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType.
: org.apache.spark.sql.catalyst.parser.ParseException:
mismatched input ',' expecting <EOF>(line 1, pos 2)
== SQL ==
ID,Name,Age
--^^^
at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:239)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:115)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parseDataType(ParseDriver.scala:39)
at org.apache.spark.sql.api.python.PythonSQLUtils$.parseDataType(PythonSQLUtils.scala:29)
at org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(PythonSQLUtils.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 846, in _parse_datatype_string
return from_ddl_datatype(s)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 838, in from_ddl_datatype
sc._jvm.org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str).json())
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 73, in deco
raise ParseException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.ParseException: "\nmismatched input ',' expecting(line 1, pos 2)\n\n== SQL ==\nID,Name,Age\n--^^^\n"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/protocol.py", line 320, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType.
: org.apache.spark.sql.catalyst.parser.ParseException:
mismatched input ',' expecting ':'(line 1, pos 9)
== SQL ==
struct<ID,Name,Age>
---------^^^
at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:239)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:115)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parseDataType(ParseDriver.scala:39)
at org.apache.spark.sql.api.python.PythonSQLUtils$.parseDataType(PythonSQLUtils.scala:29)
at org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(PythonSQLUtils.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 850, in _parse_datatype_string
return from_ddl_datatype("struct<%s>" % s.strip())
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 838, in from_ddl_datatype
sc._jvm.org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str).json())
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 73, in deco
raise ParseException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.ParseException: "\nmismatched input ',' expecting ':'(line 1, pos 9)\n\n== SQL ==\nstruct\n---------^^^\n"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "", line 1, in
File "/usr/hdp/current/spark2-client/python/pyspark/sql/session.py", line 58, in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/session.py", line 638, in createDataFrame
schema = _parse_datatype_string(schema)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 852, in _parse_datatype_string
raise e
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 842, in _parse_datatype_string
return from_ddl_schema(s)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 834, in from_ddl_schema
sc._jvm.org.apache.spark.sql.types.StructType.fromDDL(type_str).json())
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 73, in deco
raise ParseException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.ParseException: "\nextraneous input ',' expecting {'SELECT', 'FROM', 'ADD', 'AS', 'ALL', 'DISTINCT', 'WHERE', 'GROUP', 'BY', 'GROUPING', 'SETS', 'CUBE', 'ROLLUP', 'ORDER', 'HAVING', 'LIMIT', 'AT', 'OR', 'AND', 'IN', NOT, 'NO', 'EXISTS', 'BETWEEN', 'LIKE', RLIKE, 'IS', 'NULL', 'TRUE', 'FALSE', 'NULLS', 'ASC', 'DESC', 'FOR', 'INTERVAL', 'CASE', 'WHEN', 'THEN', 'ELSE', 'END', 'JOIN', 'CROSS', 'OUTER', 'INNER', 'LEFT', 'SEMI', 'RIGHT', 'FULL', 'NATURAL', 'ON', 'LATERAL', 'WINDOW', 'OVER', 'PARTITION', 'RANGE', 'ROWS', 'UNBOUNDED', 'PRECEDING', 'FOLLOWING', 'CURRENT', 'FIRST', 'AFTER', 'LAST', 'ROW', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'DIRECTORY', 'VIEW', 'REPLACE', 'INSERT', 'DELETE', 'INTO', 'DESCRIBE', 'EXPLAIN', 'FORMAT', 'LOGICAL', 'CODEGEN', 'COST', 'CAST', 'SHOW', 'TABLES', 'COLUMNS', 'COLUMN', 'USE', 'PARTITIONS', 'FUNCTIONS', 'DROP', 'UNION', 'EXCEPT', 'MINUS', 'INTERSECT', 'TO', 'TABLESAMPLE', 'STRATIFY', 'ALTER', 'RENAME', 'ARRAY', 'MAP', 'STRUCT', 'COMMENT', 'SET', 'RESET', 'DATA', 'START', 'TRANSACTION', 'COMMIT', 'ROLLBACK', 'MACRO', 'IGNORE', 'BOTH', 'LEADING', 'TRAILING', 'IF', 'POSITION', 'DIV', 'PERCENT', 'BUCKET', 'OUT', 'OF', 'SORT', 'CLUSTER', 'DISTRIBUTE', 'OVERWRITE', 'TRANSFORM', 'REDUCE', 'SERDE', 'SERDEPROPERTIES', 'RECORDREADER', 'RECORDWRITER', 'DELIMITED', 'FIELDS', 'TERMINATED', 'COLLECTION', 'ITEMS', 'KEYS', 'ESCAPED', 'LINES', 'SEPARATED', 'FUNCTION', 'EXTENDED', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'LAZY', 'FORMATTED', 'GLOBAL', TEMPORARY, 'OPTIONS', 'UNSET', 'TBLPROPERTIES', 'DBPROPERTIES', 'BUCKETS', 'SKEWED', 'STORED', 'DIRECTORIES', 'LOCATION', 'EXCHANGE', 'ARCHIVE', 'UNARCHIVE', 'FILEFORMAT', 'TOUCH', 'COMPACT', 'CONCATENATE', 'CHANGE', 'CASCADE', 'RESTRICT', 'CLUSTERED', 'SORTED', 'PURGE', 'INPUTFORMAT', 'OUTPUTFORMAT', DATABASE, DATABASES, 'DFS', 'TRUNCATE', 'ANALYZE', 'COMPUTE', 'LIST', 'STATISTICS', 'PARTITIONED', 'EXTERNAL', 'DEFINED', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'REPAIR', 'RECOVER', 'EXPORT', 'IMPORT', 'LOAD', 'ROLE', 'ROLES', 'COMPACTIONS', 'PRINCIPALS', 'TRANSACTIONS', 'INDEX', 'INDEXES', 'LOCKS', 'OPTION', 'ANTI', 'LOCAL', 'INPATH', IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 2)\n\n== SQL ==\nID,Name,Age\n--^^^\n"
>>> df1=records.toDF(ID,Name,Age)
Traceback (most recent call last):
File "", line 1, in
NameError: name 'ID' is not defined
>>> df1=records.toDF(ID,Name,Age)
Traceback (most recent call last):
File "", line 1, in
NameError: name 'ID' is not defined
>>> df1=records.toDF('ID','Name','Age')
Traceback (most recent call last):
File "", line 1, in
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> records
ParallelCollectionRDD[11] at parallelize at PythonRDD.scala:175
>>> records.collect()
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> df1=records.toDF('ID','Name','Age')
Traceback (most recent call last):
File "", line 1, in
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> df.columnns
Traceback (most recent call last):
File "", line 1, in
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'columnns'
>>> records.columnds
Traceback (most recent call last):
File "", line 1, in
AttributeError: 'RDD' object has no attribute 'columnds'
>>> records.columns
Traceback (most recent call last):
File "", line 1, in
AttributeError: 'RDD' object has no attribute 'columns'
>>> df.columnns
Traceback (most recent call last):
File "", line 1, in
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'columnns'
>>> columns = ['ID','NAME_FIRST', 'DEPT_NAME']
>>> df1=records.toDF(*columns)
Traceback (most recent call last):
File "", line 1, in
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> columns = ['ID','NAME_FIRST', 'DEPT_NAME']
>>> df2 = records.toDF(columns)
>>> df.show()
+---+-----+---+
| _1| _2| _3|
+---+-----+---+
| 1|Alice| 50|
| 2| Bob| 80|
+---+-----+---+
>>> df2.show()
+---+----------+---------+
| ID|NAME_FIRST|DEPT_NAME|
+---+----------+---------+
| 1| Alice| 50|
| 2| Bob| 80|
+---+----------+---------+
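
Passing the column names as a single list is the form toDF() accepts. A minimal sketch of the two variants that should work (the typed DDL string assumes Spark 2.3+, where it is parsed by StructType.fromDDL):

df2 = records.toDF(['ID', 'NAME_FIRST', 'DEPT_NAME'])           # one list argument
df3 = records.toDF("ID INT, NAME_FIRST STRING, DEPT_NAME INT")  # DDL string with types
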
>>> data=sc.parallelize([Row(id=1,name="Alice",score=50)])
>>> data.count()
1
>>> data.collect()
[Row(id=1, name='Alice', score=50)]
>>> df=data.toDF()
>>> df.show()
+---+-----+-----+
| id| name|score|
+---+-----+-----+
| 1|Alice| 50|
+---+-----+-----+
>>> data=sc.parallelize([Row(id=1,name="Alice",score=50),\
... Row(id=2,name="Bob",score=80),\
... Row(id=3,name="Charlee",score=75)])
>>> df.data.toDF()
Traceback (most recent call last):
File "", line 1, in
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'data'
>>> df=data.toDF()
>>> df.show()
+---+-------+-----+
| id| name|score|
+---+-------+-----+
| 1| Alice| 50|
| 2| Bob| 80|
| 3|Charlee| 75|
+---+-------+-----+
>>> complex_data=sc.parallelize([Row(col_float=1.44,col_integer=10,col_string="John")])
>>> complex_data_df=complex_data.toDF()
>>> complex_Data_df.show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_Data_df' is not defined
>>> complex_data_df.show()
+---------+-----------+----------+
|col_float|col_integer|col_string|
+---------+-----------+----------+
| 1.44| 10| John|
+---------+-----------+----------+
>>> complex_data=sc.parallelize([Row(col_float=1.44,\
... col_integer=10,\
... col_string="John",\
... col_boolean=True,\
... col_list=[1,2,3])
...
...
...
... \
... ])
>>> complex_data_df=complex_Data.toDF()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_Data' is not defined
>>> complex_data_df=complex_data.toDF()
>>> complex_Data_df.show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_Data_df' is not defined
>>> complex_data_df.show()
+-----------+---------+-----------+---------+----------+
|col_boolean|col_float|col_integer| col_list|col_string|
+-----------+---------+-----------+---------+----------+
| true| 1.44| 10|[1, 2, 3]| John|
+-----------+---------+-----------+---------+----------+
>>> complex_data=sc.parallelize([Row(\
... col_list=[1,2,3],\
... col_dict={"k1":0,"k2":1,"k3":2},\
... col_row=Row(columnA=10,columnB=20,columnC=30),\
... col_time=datetime(2014,8,1,14,1,5) \
... )])
>>> complex_data_df=complex_Data.toDF()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_Data' is not defined
>>> complex_data_df=complex_data.toDF()
>>> complex_data_df.show()
+--------------------+---------+------------+-------------------+
| col_dict| col_list| col_row| col_time|
+--------------------+---------+------------+-------------------+
|[k3 -> 2, k1 -> 0...|[1, 2, 3]|[10, 20, 30]|2014-08-01 14:01:05|
+--------------------+---------+------------+-------------------+
>>> complex_data = sc.parallelize([Row(
... col_list = [1, 2, 3],
... col_dict = {"k1": 0},
... col_row = Row(a=10, b=20, c=30),
... col_time = datetime(2014, 8, 1, 14, 1, 5)
... ),
... Row(
... col_list = [1, 2, 3, 4, 5],
... col_dict = {"k1": 0,"k2": 1 },
... col_row = Row(a=40, b=50, c=60),
... col_time = datetime(2014, 8, 2, 14, 1, 6)
... ),
... Row(
... col_list = [1, 2, 3, 4, 5, 6, 7],
... col_dict = {"k1": 0, "k2": 1, "k3": 2 },
... col_row = Row(a=70, b=80, c=90),
... col_time = datetime(2014, 8, 3, 14, 1, 7)
... )])
>>>
>>> complex_data_df = complex_data.toDF()
>>> complex_data_df.show()
+--------------------+--------------------+------------+-------------------+
| col_dict| col_list| col_row| col_time|
+--------------------+--------------------+------------+-------------------+
| [k1 -> 0]| [1, 2, 3]|[10, 20, 30]|2014-08-01 14:01:05|
| [k1 -> 0, k2 -> 1]| [1, 2, 3, 4, 5]|[40, 50, 60]|2014-08-02 14:01:06|
|[k3 -> 2, k1 -> 0...|[1, 2, 3, 4, 5, 6...|[70, 80, 90]|2014-08-03 14:01:07|
+--------------------+--------------------+------------+-------------------+
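
Each Python value maps onto a Spark SQL type here: lists become arrays, dicts become maps, nested Rows become structs, and datetime becomes a timestamp. A quick way to confirm the inferred schema (output abridged to the top-level fields):

complex_data_df.printSchema()
# root
#  |-- col_dict: map (nullable = true)
#  |-- col_list: array (nullable = true)
#  |-- col_row: struct (nullable = true)
#  |-- col_time: timestamp (nullable = true)
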
>>> sqlContext=SQLContext(sc)
>>> sqlContext
<pyspark.sql.context.SQLContext object at 0x...>
>>> df=sqlContext.range(5)
>>> df
DataFrame[id: bigint]
>>> df.printSchema
<bound method DataFrame.printSchema of DataFrame[id: bigint]>
>>> df.printSchema()
root
|-- id: long (nullable = false)
>>> df.show()
+---+
| id|
+---+
| 0|
| 1|
| 2|
| 3|
| 4|
+---+
>>> df.count()
5
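
SQLContext lives in pyspark.sql; if it is not already in the shell's namespace, it needs an import before the constructor call above works. In Spark 2.x it is a thin compatibility wrapper, and the same methods live on the shell's built-in SparkSession (`spark`):

from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)   # wraps the existing SparkContext
sqlContext.range(5).show()    # one bigint column 'id' holding 0..4
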
>>> data=[('Alice',50),('Bob',80),('Charlee',75)]
>>> sqlContext.createDataframe(data).show()
Traceback (most recent call last):
File "", line 1, in
AttributeError: 'SQLContext' object has no attribute 'createDataframe'
>>> sqlContext.createDataFrame(data).show()
+-------+---+
| _1| _2|
+-------+---+
| Alice| 50|
| Bob| 80|
|Charlee| 75|
+-------+---+
>>> sqlContext.createDataFrame(data,['Name','Score']).show()
+-------+-----+
| Name|Score|
+-------+-----+
| Alice| 50|
| Bob| 80|
|Charlee| 75|
+-------+-----+
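
Beyond a plain list of names, createDataFrame() also accepts a full StructType, which pins the types down instead of relying on inference. A minimal sketch:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField('Name', StringType(), True),    # third argument: nullable
    StructField('Score', IntegerType(), True),
])
sqlContext.createDataFrame(data, schema).show()
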
>>> complex_data = [
... (1.0,
... 10,
... "Alice",
... True,
... [1, 2, 3],
... {"k1": 0},
... Row(a=1, b=2, c=3),
... datetime(2014, 8, 1, 14, 1, 5)),
...
... (2.0,
... 20,
... "Bob",
... True,
... [1, 2, 3, 4, 5],
... {"k1": 0,"k2": 1 },
... Row(a=1, b=2, c=3),
... datetime(2014, 8, 1, 14, 1, 5)),
...
... (3.0,
... 30,
... "Charlee",
... False,
... [1, 2, 3, 4, 5, 6],
... {"k1": 0, "k2": 1, "k3": 2 },
... Row(a=1, b=2, c=3),
... datetime(2014, 8, 1, 14, 1, 5))
... ]
>>> sqlContext.createDataFrame(complex_Data).show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_Data' is not defined
>>> sqlContext.createDataFrame(complex_data).show()
+---+---+-------+-----+------------------+--------------------+---------+-------------------+
| _1| _2| _3| _4| _5| _6| _7| _8|
+---+---+-------+-----+------------------+--------------------+---------+-------------------+
|1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
|2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
|3.0| 30|Charlee|false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+---+---+-------+-----+------------------+--------------------+---------+-------------------+
>>> complex_data_df = sqlContext.createDataFrame(complex_data, [
... 'col_integer',
... 'col_float',
... 'col_string',
... 'col_boolean',
... 'col_list',
... 'col_dictionary',
... 'col_row',
... 'col_date_time']
... )
>>> complex_data_df.show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_dictionary| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
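
One thing to note in the output above: the first two labels are swapped relative to the data, so col_integer ends up holding the floats (1.0, 2.0, 3.0) and col_float the integers (10, 20, 30). Column names are applied positionally; Spark does not check them against the values.
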
>>>
>>> data=sc.parallelize([Row(1,"Alice",50),\
... Row(2,"Bob",80),\
... Row(3,"Charlee",75)])
>>> column_names=Row('id','name','score')
>>> students=data.map(lambda r:column_names(*r))
>>> students
PythonRDD[173] at RDD at PythonRDD.scala:48
>>> students.collect()
[Row(id=1, name='Alice', score=50), Row(id=2, name='Bob', score=80), Row(id=3, name='Charlee', score=75)]
>>> students_df=sqlContext.createDataFrame(students)
>>> students_df;
DataFrame[id: bigint, name: string, score: bigint]
>>> students_df.show()
+---+-------+-----+
| id| name|score|
+---+-------+-----+
| 1| Alice| 50|
| 2| Bob| 80|
| 3|Charlee| 75|
+---+-------+-----+
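
The pattern above uses a Row object as a reusable record factory: Row('id', 'name', 'score') captures only the field names, and calling it with values produces named Rows. Condensed:

column_names = Row('id', 'name', 'score')       # field names only
column_names(1, 'Alice', 50)                    # Row(id=1, name='Alice', score=50)
students_df = sqlContext.createDataFrame(
    data.map(lambda r: column_names(*r)))       # name every positional Row
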
>>> complex_data_df.first()
Row(col_integer=1.0, col_float=10, col_string='Alice', col_boolean=True, col_list=[1, 2, 3], col_dictionary={'k1': 0}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5))
>>> complex_data_df.first()
Row(col_integer=1.0, col_float=10, col_string='Alice', col_boolean=True, col_list=[1, 2, 3], col_dictionary={'k1': 0}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5))
>>> complex_data_df.take(2)
[Row(col_integer=1.0, col_float=10, col_string='Alice', col_boolean=True, col_list=[1, 2, 3], col_dictionary={'k1': 0}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5)), Row(col_integer=2.0, col_float=20, col_string='Bob', col_boolean=True, col_list=[1, 2, 3, 4, 5], col_dictionary={'k1': 0, 'k2': 1}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5))]
>>> cell_list=complex_Data_df.collect()[0[2]
... cell_list=complex_Data_df.collect()[0[2]
File "", line 2
cell_list=complex_Data_df.collect()[0[2]
^
SyntaxError: invalid syntax
>>> cell_list=complex_data_df.collect()[0[2]
...
...
... }
File "", line 4
}
^
SyntaxError: invalid syntax
>>> cell_list=complex_data_df.collect()[0][2]
>>> cell_list
'Alice'
>>> cell_list=complex_data_Df.collect()[0][4]
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_data_Df' is not defined
>>> cell_list=complex_data_df.collect()[0][4]
>>> cell_list
[1, 2, 3]
>>> cell_list.append(100)
>>> cell_list
[1, 2, 3, 100]
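
collect() materializes plain local copies on the driver, so appending 100 to cell_list does not write back to the DataFrame; the show() that follows still prints [1, 2, 3] in the first row.
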
>>> complex_data_df.show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_dictionary| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
>>> complex_data_df.rdd.map(lambda x:(x.col_string,x.col_dictionary)).collect()
[('Alice', {'k1': 0}), ('Bob', {'k1': 0, 'k2': 1}), ('Charlee', {'k3': 2, 'k1': 0, 'k2': 1})]
>>> complex_data_df.select('col_string','col_list','col_date_time').show()
+----------+------------------+-------------------+
|col_string| col_list| col_date_time|
+----------+------------------+-------------------+
| Alice| [1, 2, 3]|2014-08-01 14:01:05|
| Bob| [1, 2, 3, 4, 5]|2014-08-01 14:01:05|
| Charlee|[1, 2, 3, 4, 5, 6]|2014-08-01 14:01:05|
+----------+------------------+-------------------+
>>> complex_data_df.rdd.map(lambda x:(x.col_string+"Boo")).collect()
['AliceBoo', 'BobBoo', 'CharleeBoo']
>>> complex_data_df.select('col_integer','col_float').\
... withColum("col_sum",complex_data_df.col_integer + complex_data_df.col_float).show()
Traceback (most recent call last):
File "", line 1, in
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'withColum'
>>> complex_data_df.select('col_integer','col_float').\
... withColumn("col_sum",complex_data_df.col_integer + complex_data_df.col_float).show()
+-----------+---------+-------+
|col_integer|col_float|col_sum|
+-----------+---------+-------+
| 1.0| 10| 11.0|
| 2.0| 20| 22.0|
| 3.0| 30| 33.0|
+-----------+---------+-------+
>>> complex_data_df.select('col_integer','col_float').\
... withColumn("col_sum",col_integer +col_float).show()
Traceback (most recent call last):
File "", line 2, in
NameError: name 'col_integer' is not defined
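
The NameError is Python's, not Spark's: col_integer is not a Python variable. Column references must either be qualified through the DataFrame, as in the successful call above, or built with pyspark.sql.functions.col. A sketch of the col() form:

from pyspark.sql.functions import col

complex_data_df.select('col_integer', 'col_float') \
    .withColumn('col_sum', col('col_integer') + col('col_float')).show()
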
>>> complex_Data_df.select('col_boolean').withColumn("col_opposite",complex_data_df.col_boolean == False ).show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_Data_df' is not defined
>>> complex_data_df.select('col_boolean').withColumn("col_opposite",complex_data_df.col_boolean == False ).show()
+-----------+------------+
|col_boolean|col_opposite|
+-----------+------------+
| true| false|
| true| false|
| false| true|
+-----------+------------+
>>> complext_data_df.withColumnRenamed("col_dictionary","col_map").show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complext_data_df' is not defined
>>> complex_data_df.withColumnRenamed("col_dictionary","col_map").show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_map| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
>>> complext_data_df.withColumnRenamed(col_dictionary,col_map).show()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
NameError: name 'complext_data_df' is not defined
>>> complext_data_data.select(col_dictionary).show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complext_data_data' is not defined
>>> complex_data_df.withColumnRenamed(col_dictionary,col_map).show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'col_dictionary' is not defined
>>> complex_data_data.select(col_dictionary).show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_data_data' is not defined
>>> complex_data_data.select(complex_data_dfcol_dictionary).show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'complex_data_data' is not defined
>>> complex_data_df.select(col_dictionary).show()
Traceback (most recent call last):
File "", line 1, in
NameError: name 'col_dictionary' is not defined
>>> complex_data_df.select(complex_data_df.col_dictionary).show()
+--------------------+
| col_dictionary|
+--------------------+
| [k1 -> 0]|
| [k1 -> 0, k2 -> 1]|
|[k3 -> 2, k1 -> 0...|
+--------------------+
>>> complex_data_df.select(complex_data_df.col_dictionary.alias("Name").show()
... )
Traceback (most recent call last):
File "", line 1, in
TypeError: 'Column' object is not callable
>>> complex_data_df.select(complex_data_df.col_string.alias("Name")).show()
+-------+
| Name|
+-------+
| Alice|
| Bob|
|Charlee|
+-------+
>>> import pandas
>>> df_pandas=complex_data_df.toPandas()
>>> df_pandas
col_integer col_float col_string col_boolean col_list col_dictionary col_row col_date_time
0 1.0 10 Alice True [1, 2, 3] {'k1': 0} (1, 2, 3) 2014-08-01 14:01:05
1 2.0 20 Bob True [1, 2, 3, 4, 5] {'k1': 0, 'k2': 1} (1, 2, 3) 2014-08-01 14:01:05
2 3.0 30 Charlee False [1, 2, 3, 4, 5, 6] {'k3': 2, 'k1': 0, 'k2': 1} (1, 2, 3) 2014-08-01 14:01:05
>>> df_spark=sqlContext.createDataFrame(df_pandas).show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_dictionary| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
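
The round trip through pandas works in both directions, but toPandas() collects the entire DataFrame into driver memory, so it is only safe when the data fits on one machine. A minimal sketch, assuming pandas is installed on the driver:

df_pandas = complex_data_df.toPandas()            # Spark -> local pandas
df_spark = sqlContext.createDataFrame(df_pandas)  # pandas -> Spark
df_spark.show()

One caveat about the session above: chaining .show() onto the assignment stores None in df_spark, because show() only prints and returns nothing; assign the DataFrame first, then call show() on it.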
File "
from datetime imort datetime
^
SyntaxError: invalid syntax
>>> from datetime import datetime
>>>
>>>
>>> simple_date=sc.parallelize([1,"Alice",50])
>>> simple_date=sc.parallelize(1,"Alice",50)
Traceback (most recent call last):
File "
TypeError: parallelize() takes from 2 to 3 positional arguments but 4 were given
>>> simple_date=sc.parallelize([1,"Alice",50])
>>> simple_date
ParallelCollectionRDD[1] at parallelize at PythonRDD.scala:175
>>> simple_data
Traceback (most recent call last):
File "
NameError: name 'simple_data' is not defined
>>> simple_data=sc.parallelize([1,"Alice",50])
>>> simple_data
ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:175
>>> simple_date.count()
[Stage 0:> (0[Stage 0:> (0 3
>>> simple_data.count()
3
>>> simple_date.first()
1
>>> simple_date.first()
1
>>> simple_date.collect()
[1, 'Alice', 50]
>>> records=sc.parallelize([[1,"Alice",50],[2,"Bob",80]])
>>> records
ParallelCollectionRDD[11] at parallelize at PythonRDD.scala:175
>>> records.collect()
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> records.count()
2
>>> records.first()
[1, 'Alice', 50]
>>> records.take(2)
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> records.take(3)
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> records.take(0)
[]
>>> records.take()
Traceback (most recent call last):
File "
TypeError: take() missing 1 required positional argument: 'num'
>>> records.take(1)
[[1, 'Alice', 50]]
>>> records.collect()
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> df=records.toDF()
>>> df1=records.toDF("ID","Name","Age")
Traceback (most recent call last):
File "
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> df.show()
+---+-----+---+
| _1| _2| _3|
+---+-----+---+
| 1|Alice| 50|
| 2| Bob| 80|
+---+-----+---+
>>> df1=records.toDF("ID,Name",Age")
File "
df1=records.toDF("ID,Name",Age")
^
SyntaxError: EOL while scanning string literal
>>> df1=records.toDF("ID,Name,Age")
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/protocol.py", line 320, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType.
: org.apache.spark.sql.catalyst.parser.ParseException:
mismatched input ',' expecting
== SQL ==
ID,Name,Age
--^^^
at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:239)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:115)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parseDataType(ParseDriver.scala:39)
at org.apache.spark.sql.api.python.PythonSQLUtils$.parseDataType(PythonSQLUtils.scala:29)
at org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(PythonSQLUtils.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 846, in _parse_datatype_string
return from_ddl_datatype(s)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 838, in from_ddl_datatype
sc._jvm.org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str).json())
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 73, in deco
raise ParseException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.ParseException: "\nmismatched input ',' expecting
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/protocol.py", line 320, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType.
: org.apache.spark.sql.catalyst.parser.ParseException:
mismatched input ',' expecting ':'(line 1, pos 9)
== SQL ==
struct
---------^^^
at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:239)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:115)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parseDataType(ParseDriver.scala:39)
at org.apache.spark.sql.api.python.PythonSQLUtils$.parseDataType(PythonSQLUtils.scala:29)
at org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(PythonSQLUtils.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 850, in _parse_datatype_string
return from_ddl_datatype("struct<%s>" % s.strip())
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 838, in from_ddl_datatype
sc._jvm.org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str).json())
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 73, in deco
raise ParseException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.ParseException: "\nmismatched input ',' expecting ':'(line 1, pos 9)\n\n== SQL ==\nstruct
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "
File "/usr/hdp/current/spark2-client/python/pyspark/sql/session.py", line 58, in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/session.py", line 638, in createDataFrame
schema = _parse_datatype_string(schema)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 852, in _parse_datatype_string
raise e
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 842, in _parse_datatype_string
return from_ddl_schema(s)
File "/usr/hdp/current/spark2-client/python/pyspark/sql/types.py", line 834, in from_ddl_schema
sc._jvm.org.apache.spark.sql.types.StructType.fromDDL(type_str).json())
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 73, in deco
raise ParseException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.ParseException: "\nextraneous input ',' expecting {'SELECT', 'FROM', 'ADD', 'AS', 'ALL', 'DISTINCT', 'WHERE', 'GROUP', 'BY', 'GROUPING', 'SETS', 'CUBE', 'ROLLUP', 'ORDER', 'HAVING', 'LIMIT', 'AT', 'OR', 'AND', 'IN', NOT, 'NO', 'EXISTS', 'BETWEEN', 'LIKE', RLIKE, 'IS', 'NULL', 'TRUE', 'FALSE', 'NULLS', 'ASC', 'DESC', 'FOR', 'INTERVAL', 'CASE', 'WHEN', 'THEN', 'ELSE', 'END', 'JOIN', 'CROSS', 'OUTER', 'INNER', 'LEFT', 'SEMI', 'RIGHT', 'FULL', 'NATURAL', 'ON', 'LATERAL', 'WINDOW', 'OVER', 'PARTITION', 'RANGE', 'ROWS', 'UNBOUNDED', 'PRECEDING', 'FOLLOWING', 'CURRENT', 'FIRST', 'AFTER', 'LAST', 'ROW', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'DIRECTORY', 'VIEW', 'REPLACE', 'INSERT', 'DELETE', 'INTO', 'DESCRIBE', 'EXPLAIN', 'FORMAT', 'LOGICAL', 'CODEGEN', 'COST', 'CAST', 'SHOW', 'TABLES', 'COLUMNS', 'COLUMN', 'USE', 'PARTITIONS', 'FUNCTIONS', 'DROP', 'UNION', 'EXCEPT', 'MINUS', 'INTERSECT', 'TO', 'TABLESAMPLE', 'STRATIFY', 'ALTER', 'RENAME', 'ARRAY', 'MAP', 'STRUCT', 'COMMENT', 'SET', 'RESET', 'DATA', 'START', 'TRANSACTION', 'COMMIT', 'ROLLBACK', 'MACRO', 'IGNORE', 'BOTH', 'LEADING', 'TRAILING', 'IF', 'POSITION', 'DIV', 'PERCENT', 'BUCKET', 'OUT', 'OF', 'SORT', 'CLUSTER', 'DISTRIBUTE', 'OVERWRITE', 'TRANSFORM', 'REDUCE', 'SERDE', 'SERDEPROPERTIES', 'RECORDREADER', 'RECORDWRITER', 'DELIMITED', 'FIELDS', 'TERMINATED', 'COLLECTION', 'ITEMS', 'KEYS', 'ESCAPED', 'LINES', 'SEPARATED', 'FUNCTION', 'EXTENDED', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'LAZY', 'FORMATTED', 'GLOBAL', TEMPORARY, 'OPTIONS', 'UNSET', 'TBLPROPERTIES', 'DBPROPERTIES', 'BUCKETS', 'SKEWED', 'STORED', 'DIRECTORIES', 'LOCATION', 'EXCHANGE', 'ARCHIVE', 'UNARCHIVE', 'FILEFORMAT', 'TOUCH', 'COMPACT', 'CONCATENATE', 'CHANGE', 'CASCADE', 'RESTRICT', 'CLUSTERED', 'SORTED', 'PURGE', 'INPUTFORMAT', 'OUTPUTFORMAT', DATABASE, DATABASES, 'DFS', 'TRUNCATE', 'ANALYZE', 'COMPUTE', 'LIST', 'STATISTICS', 'PARTITIONED', 'EXTERNAL', 'DEFINED', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'REPAIR', 'RECOVER', 'EXPORT', 'IMPORT', 'LOAD', 'ROLE', 'ROLES', 'COMPACTIONS', 'PRINCIPALS', 'TRANSACTIONS', 'INDEX', 'INDEXES', 'LOCKS', 'OPTION', 'ANTI', 'LOCAL', 'INPATH', IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 2)\n\n== SQL ==\nID,Name,Age\n--^^^\n"
>>> df1=records.toDF(ID,Name,Age)
Traceback (most recent call last):
File "
NameError: name 'ID' is not defined
>>> df1=records.toDF(ID,Name,Age)
Traceback (most recent call last):
File "
NameError: name 'ID' is not defined
>>> df1=records.toDF('ID','Name','Age')
Traceback (most recent call last):
File "
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> records
ParallelCollectionRDD[11] at parallelize at PythonRDD.scala:175
>>> records.collect()
[[1, 'Alice', 50], [2, 'Bob', 80]]
>>> df1=records.toDF('ID','Name','Age')
Traceback (most recent call last):
File "
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> df.columnns
Traceback (most recent call last):
File "
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'columnns'
>>> records.columnds
Traceback (most recent call last):
File "
AttributeError: 'RDD' object has no attribute 'columnds'
>>> records.columns
Traceback (most recent call last):
File "
AttributeError: 'RDD' object has no attribute 'columns'
>>> df.columnns
Traceback (most recent call last):
File "
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'columnns'
>>> columns = ['ID','NAME_FIRST', 'DEPT_NAME']
>>> df1=records.toDF(*columns)
Traceback (most recent call last):
File "
TypeError: toDF() takes from 1 to 3 positional arguments but 4 were given
>>> columns = ['ID','NAME_FIRST', 'DEPT_NAME']
>>> df2 = records.toDF(columns)
>>> df.show()
+---+-----+---+
| _1| _2| _3|
+---+-----+---+
| 1|Alice| 50|
| 2| Bob| 80|
+---+-----+---+
>>> df2.show()
+---+----------+---------+
| ID|NAME_FIRST|DEPT_NAME|
+---+----------+---------+
| 1| Alice| 50|
| 2| Bob| 80|
+---+----------+---------+
>>> data=sc.parallelize([Row(id=1,name="Alice",score=50)])
>>> data.count()
1
>>> data.collect()
[Row(id=1, name='Alice', score=50)]
>>> df=data.toDF()
>>> df.show()
+---+-----+-----+
| id| name|score|
+---+-----+-----+
| 1|Alice| 50|
+---+-----+-----+
>>> data=sc.parallelize([Row(id=1,name="Alice",score=50),\
... Row(id=2,name="Bob",score=80),\
... Row(id=3,name="Charlee",score=75)])
>>> df.data.toDF()
Traceback (most recent call last):
File "
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'data'
>>> df=data.toDF()
>>> df.show()
+---+-------+-----+
| id| name|score|
+---+-------+-----+
| 1| Alice| 50|
| 2| Bob| 80|
| 3|Charlee| 75|
+---+-------+-----+
>>> complex_data=sc.parallelize([Row(col_float=1.44,col_integer=10,col_string="John")])
>>> complex_data_df=complex_data.toDF()
>>> complex_Data_df.show()
Traceback (most recent call last):
File "
NameError: name 'complex_Data_df' is not defined
>>> complex_data_df.show()
+---------+-----------+----------+
|col_float|col_integer|col_string|
+---------+-----------+----------+
| 1.44| 10| John|
+---------+-----------+----------+
>>> complex_data=sc.parallelize([Row(col_float=1.44,\
... col_integer=10,\
... col_string="John",\
... col_boolean=True,\
... col_list=[1,2,3])
...
...
...
... \
... ])
>>> complex_data_df=complex_Data.toDF()
Traceback (most recent call last):
File "
NameError: name 'complex_Data' is not defined
>>> complex_data_df=complex_data.toDF()
>>> complex_Data_df.show()
Traceback (most recent call last):
File "
NameError: name 'complex_Data_df' is not defined
>>> complex_data_df.show()
+-----------+---------+-----------+---------+----------+
|col_boolean|col_float|col_integer| col_list|col_string|
+-----------+---------+-----------+---------+----------+
| true| 1.44| 10|[1, 2, 3]| John|
+-----------+---------+-----------+---------+----------+
>>> complex_data=sc.parallelize([Row(\
... col_list=[1,2,3],\
... col_dict={"k1":0,"k2":1,"k3":2},\
... col_row=Row(columnA=10,columnB=20,columnC=30),\
... col_time=datetime(2014,8,1,14,1,5) \
... )])
>>> complex_data_df=complex_Data.toDF()
Traceback (most recent call last):
File "
NameError: name 'complex_Data' is not defined
>>> complex_data_df=complex_data.toDF()
>>> complex_data_df.show()
+--------------------+---------+------------+-------------------+
| col_dict| col_list| col_row| col_time|
+--------------------+---------+------------+-------------------+
|[k3 -> 2, k1 -> 0...|[1, 2, 3]|[10, 20, 30]|2014-08-01 14:01:05|
+--------------------+---------+------------+-------------------+
>>> complex_data = sc.parallelize([Row(
... col_list = [1, 2, 3],
... col_dict = {"k1": 0},
... col_row = Row(a=10, b=20, c=30),
... col_time = datetime(2014, 8, 1, 14, 1, 5)
... ),
... Row(
... col_list = [1, 2, 3, 4, 5],
... col_dict = {"k1": 0,"k2": 1 },
... col_row = Row(a=40, b=50, c=60),
... col_time = datetime(2014, 8, 2, 14, 1, 6)
... ),
... Row(
... col_list = [1, 2, 3, 4, 5, 6, 7],
... col_dict = {"k1": 0, "k2": 1, "k3": 2 },
... col_row = Row(a=70, b=80, c=90),
... col_time = datetime(2014, 8, 3, 14, 1, 7)
... )])
>>>
>>> complex_data_df = complex_data.toDF()
>>> complex_data_df.show()
+--------------------+--------------------+------------+-------------------+
| col_dict| col_list| col_row| col_time|
+--------------------+--------------------+------------+-------------------+
| [k1 -> 0]| [1, 2, 3]|[10, 20, 30]|2014-08-01 14:01:05|
| [k1 -> 0, k2 -> 1]| [1, 2, 3, 4, 5]|[40, 50, 60]|2014-08-02 14:01:06|
|[k3 -> 2, k1 -> 0...|[1, 2, 3, 4, 5, 6...|[70, 80, 90]|2014-08-03 14:01:07|
+--------------------+--------------------+------------+-------------------+
>>> sqlContext=SQLContext(sc)
>>> sqlContext
>>> df=sqlContext.range(5)
>>> df
DataFrame[id: bigint]
>>> df.printSchema
>>> df.printSchema()
root
|-- id: long (nullable = false)
>>> df.show()
+---+
| id|
+---+
| 0|
| 1|
| 2|
| 3|
| 4|
+---+
>>> df.count()
5
>>> data=[('Alice',50),('Bob',80),('Charlee',75)]
>>> sqlContext.createDataframe(data).show()
Traceback (most recent call last):
File "
AttributeError: 'SQLContext' object has no attribute 'createDataframe'
>>> sqlContext.createDataFrame(data).show()
+-------+---+
| _1| _2|
+-------+---+
| Alice| 50|
| Bob| 80|
|Charlee| 75|
+-------+---+
>>> sqlContext.createDataFrame(data,['Name','Score']).show()
+-------+-----+
| Name|Score|
+-------+-----+
| Alice| 50|
| Bob| 80|
|Charlee| 75|
+-------+-----+
>>> complex_data = [
... (1.0,
... 10,
... "Alice",
... True,
... [1, 2, 3],
... {"k1": 0},
... Row(a=1, b=2, c=3),
... datetime(2014, 8, 1, 14, 1, 5)),
...
... (2.0,
... 20,
... "Bob",
... True,
... [1, 2, 3, 4, 5],
... {"k1": 0,"k2": 1 },
... Row(a=1, b=2, c=3),
... datetime(2014, 8, 1, 14, 1, 5)),
...
... (3.0,
... 30,
... "Charlee",
... False,
... [1, 2, 3, 4, 5, 6],
... {"k1": 0, "k2": 1, "k3": 2 },
... Row(a=1, b=2, c=3),
... datetime(2014, 8, 1, 14, 1, 5))
... ]
>>> sqlContext.createDataFrame(complex_Data).show()
Traceback (most recent call last):
File "
NameError: name 'complex_Data' is not defined
>>> sqlContext.createDataFrame(complex_data).show()
+---+---+-------+-----+------------------+--------------------+---------+-------------------+
| _1| _2| _3| _4| _5| _6| _7| _8|
+---+---+-------+-----+------------------+--------------------+---------+-------------------+
|1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
|2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
|3.0| 30|Charlee|false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+---+---+-------+-----+------------------+--------------------+---------+-------------------+
>>> complex_data_df = sqlContext.createDataFrame(complex_data, [
... 'col_integer',
... 'col_float',
... 'col_string',
... 'col_boolean',
... 'col_list',
... 'col_dictionary',
... 'col_row',
... 'col_date_time']
... )
>>> complex_data_df.show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_dictionary| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
>>>
>>> data=sc.parallelize([Row(1,"Alice",50),\
... Row(2,"Bob",80),\
... Row(3,"Charlee",75)])
>>> column_names=Row('id','name','score')
>>> students=data.map(lambda r:column_names(*r))
>>> students
PythonRDD[173] at RDD at PythonRDD.scala:48
>>> students.collect()
[Row(id=1, name='Alice', score=50), Row(id=2, name='Bob', score=80), Row(id=3, name='Charlee', score=75)]
>>> students_df=sqlContext.createDataFrame(students)
>>> students_df;
DataFrame[id: bigint, name: string, score: bigint]
>>> students_df.show()
+---+-------+-----+
| id| name|score|
+---+-------+-----+
| 1| Alice| 50|
| 2| Bob| 80|
| 3|Charlee| 75|
+---+-------+-----+
>>> complex_data_df.first()
Row(col_integer=1.0, col_float=10, col_string='Alice', col_boolean=True, col_list=[1, 2, 3], col_dictionary={'k1': 0}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5))
>>> complex_data_df.first()
Row(col_integer=1.0, col_float=10, col_string='Alice', col_boolean=True, col_list=[1, 2, 3], col_dictionary={'k1': 0}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5))
>>> complex_data_df.take(2)
[Row(col_integer=1.0, col_float=10, col_string='Alice', col_boolean=True, col_list=[1, 2, 3], col_dictionary={'k1': 0}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5)), Row(col_integer=2.0, col_float=20, col_string='Bob', col_boolean=True, col_list=[1, 2, 3, 4, 5], col_dictionary={'k1': 0, 'k2': 1}, col_row=Row(a=1, b=2, c=3), col_date_time=datetime.datetime(2014, 8, 1, 14, 1, 5))]
>>> cell_list=complex_Data_df.collect()[0[2]
... cell_list=complex_Data_df.collect()[0[2]
File "
cell_list=complex_Data_df.collect()[0[2]
^
SyntaxError: invalid syntax
>>> cell_list=complex_data_df.collect()[0[2]
...
...
... }
File "
}
^
SyntaxError: invalid syntax
>>> cell_list=complex_data_df.collect()[0][2]
>>> cell_list
'Alice'
>>> cell_list=complex_data_Df.collect()[0][4]
Traceback (most recent call last):
File "
NameError: name 'complex_data_Df' is not defined
>>> cell_list=complex_data_df.collect()[0][4]
>>> cell_list
[1, 2, 3]
>>> cell_list.append(100)
>>> cell_list
[1, 2, 3, 100]
>>> complex_data_df.show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_dictionary| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
>>> complex_data_df.rdd.map(lambda x:(x.col_string,x.col_dictionary)).collect()
[('Alice', {'k1': 0}), ('Bob', {'k1': 0, 'k2': 1}), ('Charlee', {'k3': 2, 'k1': 0, 'k2': 1})]
>>> complex_data_df.select('col_string','col_list','col_date_time').show()
+----------+------------------+-------------------+
|col_string| col_list| col_date_time|
+----------+------------------+-------------------+
| Alice| [1, 2, 3]|2014-08-01 14:01:05|
| Bob| [1, 2, 3, 4, 5]|2014-08-01 14:01:05|
| Charlee|[1, 2, 3, 4, 5, 6]|2014-08-01 14:01:05|
+----------+------------------+-------------------+
>>> complex_data_df.rdd.map(lambda x:(x.col_string+"Boo")).collect()
['AliceBoo', 'BobBoo', 'CharleeBoo']
>>> complex_data_df.select('col_integer','col_float').\
... withColum("col_sum",complex_data_df.col_integer + complex_data_df.col_float).show()
Traceback (most recent call last):
File "
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 1182, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'withColum'
>>> complex_data_df.select('col_integer','col_float').\
... withColumn("col_sum",complex_data_df.col_integer + complex_data_df.col_float).show()
+-----------+---------+-------+
|col_integer|col_float|col_sum|
+-----------+---------+-------+
| 1.0| 10| 11.0|
| 2.0| 20| 22.0|
| 3.0| 30| 33.0|
+-----------+---------+-------+
>>> complex_data_df.select('col_integer','col_float').\
... withColumn("col_sum",col_integer +col_float).show()
Traceback (most recent call last):
File "
NameError: name 'col_integer' is not defined
>>> complex_Data_df.select('col_boolean').withColumn("col_opposite",complex_data_df.col_boolean == False ).show()
Traceback (most recent call last):
File "
NameError: name 'complex_Data_df' is not defined
>>> complex_data_df.select('col_boolean').withColumn("col_opposite",complex_data_df.col_boolean == False ).show()
+-----------+------------+
|col_boolean|col_opposite|
+-----------+------------+
| true| false|
| true| false|
| false| true|
+-----------+------------+
>>> complext_data_df.withColumnRenamed("col_dictionary","col_map").show()
Traceback (most recent call last):
File "
NameError: name 'complext_data_df' is not defined
>>> complex_data_df.withColumnRenamed("col_dictionary","col_map").show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_map| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
>>> complext_data_df.withColumnRenamed(col_dictionary,col_map).show() Traceback (most recent call last):
File "
NameError: name 'complext_data_df' is not defined
>>> complext_data_data.select(col_dictionary).show()
Traceback (most recent call last):
File "
NameError: name 'complext_data_data' is not defined
>>> complex_data_df.withColumnRenamed(col_dictionary,col_map).show()
Traceback (most recent call last):
File "
NameError: name 'col_dictionary' is not defined
>>> complex_data_data.select(col_dictionary).show()
Traceback (most recent call last):
File "
NameError: name 'complex_data_data' is not defined
>>> complex_data_data.select(complex_data_dfcol_dictionary).show()
Traceback (most recent call last):
File "
NameError: name 'complex_data_data' is not defined
>>> complex_data_df.select(col_dictionary).show()
Traceback (most recent call last):
File "
NameError: name 'col_dictionary' is not defined
>>> complex_data_df.select(complex_data_df.col_dictionary).show()
+--------------------+
| col_dictionary|
+--------------------+
| [k1 -> 0]|
| [k1 -> 0, k2 -> 1]|
|[k3 -> 2, k1 -> 0...|
+--------------------+
>>> complex_data_df.select(complex_data_df.col_dictionary.alias("Name").show()
... )
Traceback (most recent call last):
File "
TypeError: 'Column' object is not callable
>>> complex_data_df.select(complex_data_df.col_string.alias("Name")).show()
+-------+
| Name|
+-------+
| Alice|
| Bob|
|Charlee|
+-------+
>>> import pandas
>>> df_pandas=complex_data_df.toPandas()
>>> df_pandas
col_integer col_float col_string col_boolean col_list col_dictionary col_row col_date_time
0 1.0 10 Alice True [1, 2, 3] {'k1': 0} (1, 2, 3) 2014-08-01 14:01:05
1 2.0 20 Bob True [1, 2, 3, 4, 5] {'k1': 0, 'k2': 1} (1, 2, 3) 2014-08-01 14:01:05
2 3.0 30 Charlee False [1, 2, 3, 4, 5, 6] {'k3': 2, 'k1': 0, 'k2': 1} (1, 2, 3) 2014-08-01 14:01:05
>>> df_spark=sqlContext.createDataFrame(df_pandas).show()
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean| col_list| col_dictionary| col_row| col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
| 1.0| 10| Alice| true| [1, 2, 3]| [k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
| 2.0| 20| Bob| true| [1, 2, 3, 4, 5]| [k1 -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
| 3.0| 30| Charlee| false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, k1 -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
No comments:
Post a Comment