antrep1234
0
Q:

create a dataframe python

import numpy as np
import pandas as pd

# Build a 3x3 DataFrame from a 2-D NumPy array. `columns` supplies the column
# labels; no index is passed, so the rows get the default 0, 1, 2 labels.
df2 = pd.DataFrame(
    np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    columns=['a', 'b', 'c'],
)
print(df2)
#    a  b  c
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9
7
import numpy as np
import pandas as pd

# Two length-10 vectors: one filled with zeros, one filled with ones.
vect1 = np.zeros(shape=10)
vect2 = np.ones(shape=10)

# Each dict key becomes a column label; each vector becomes that column's data.
df = pd.DataFrame(data={'col1': vect1, 'col2': vect2})
5
# Column-oriented input: each key is a column label, each list that column's values.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)
# Bare expression: echoes the frame in a REPL/notebook, does nothing in a script.
df
3
# Template for building a DataFrame from a dict of column lists.
# NOTE(review): the `...` entries and the `....` line below look like
# placeholders for additional values/columns to be filled in by the reader;
# the `....` line is not valid Python, so this template does not run as written.
import pandas as pd

# Maps each column name to the list of values for that column.
data = {'First Column Name':  ['First value', 'Second value',...],
        'Second Column Name': ['First value', 'Second value',...],
         ....
        }

# `columns` lists the column names to use for the frame.
df = pd.DataFrame (data, columns = ['First Column Name','Second Column Name',...])

print (df)
6
# Create a pandas DataFrame from a dict of lists, with custom row labels.
import pandas as pd

# Column data: each key is a column name, each list holds that column's values.
data = {
    'Name': ['Tom', 'Jack', 'nick', 'juli'],
    'marks': [99, 98, 95, 90],
}

# `index` supplies one row label per entry in the lists above.
df = pd.DataFrame(
    data,
    index=['rank1', 'rank2', 'rank3', 'rank4'],
)

# Bare expression: displays the frame in a REPL/notebook.
df
0
# Creates Spark DataFrames with `spark.createDataFrame` from several inputs.
#
# NOTE(review): `spark` and `sc` are not defined in this snippet; presumably
# they are the SparkSession / SparkContext provided by an interactive PySpark
# shell -- confirm before running this as a standalone script.
# The `# [...]` comment after each call is the output shown for that call.

import pandas

from pyspark.sql import Row
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# From a list of tuples, without and with explicit column names.
l = [('Alice', 1)]
spark.createDataFrame(l).collect()
# [Row(_1=u'Alice', _2=1)]
spark.createDataFrame(l, ['name', 'age']).collect()
# [Row(name=u'Alice', age=1)]

# From a list of dicts: the dict keys become the column names.
d = [{'name': 'Alice', 'age': 1}]
spark.createDataFrame(d).collect()
# [Row(age=1, name=u'Alice')]

# From an RDD, without and with explicit column names.
rdd = sc.parallelize(l)
spark.createDataFrame(rdd).collect()
# [Row(_1=u'Alice', _2=1)]
df = spark.createDataFrame(rdd, ['name', 'age'])
df.collect()
# [Row(name=u'Alice', age=1)]

# From an RDD of Row objects built with a named Row factory.
Person = Row('name', 'age')
person = rdd.map(lambda r: Person(*r))
df2 = spark.createDataFrame(person)
df2.collect()
# [Row(name=u'Alice', age=1)]

# From an RDD plus an explicit StructType schema.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df3 = spark.createDataFrame(rdd, schema)
df3.collect()
# [Row(name=u'Alice', age=1)]

# From pandas DataFrames.
spark.createDataFrame(df.toPandas()).collect()
# [Row(name=u'Alice', age=1)]
spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()
# [Row(0=1, 1=2)]

# With the schema given as a string; the column names come from that string.
spark.createDataFrame(rdd, "a: string, b: int").collect()
# [Row(a=u'Alice', b=1)]

# Keep only the second field of each row, then use a single-type schema string.
rdd = rdd.map(lambda row: row[1])
spark.createDataFrame(rdd, "int").collect()
# [Row(value=1)]

# The same data with a "boolean" schema fails, as the traceback below shows.
spark.createDataFrame(rdd, "boolean").collect()
# Traceback (most recent call last):
#     ...
# Py4JJavaError: ...
-2

New to Communities?

Join the community